Compare commits

..

21 Commits

Author SHA1 Message Date
Charlotte.Bonte
e817c9dcab make the parameters dependent on how many multiplications are performed 2024-03-26 14:44:39 +01:00
Charlotte.Bonte
e1dcf4fb7e make the parameters dependent on how many multiplications are performed 2024-03-26 14:02:57 +01:00
Charlotte.Bonte
1e47e3eb3f make the parameters dependent on how many multiplications are performed 2024-03-26 13:30:53 +01:00
Charlotte.Bonte
032c89453d make the parameters dependent on how many multiplications are performed 2024-03-26 12:22:04 +01:00
Charlotte.Bonte
b14a38bd18 make the parameters dependent on how many multiplications are performed 2024-03-26 11:55:19 +01:00
Charlotte.Bonte
7874ff847e make the parameters dependent on how many multiplications are performed 2024-03-26 11:03:51 +01:00
Charlotte.Bonte
1db67ad90c change number of multiplications 2024-03-21 16:02:04 +01:00
Charlotte.Bonte
737d6be4f1 change number of multiplications 2024-03-21 15:20:41 +01:00
Charlotte.Bonte
73654cd5d7 add multiple multiplications to the square trick circuit, disable benching for ks 2024-03-21 11:37:27 +01:00
Charlotte.Bonte
49f3b47e02 lower the number of samples for the benches 2024-03-21 08:07:00 +01:00
Charlotte.Bonte
83eb426c41 change the number of samples packed in the packing KS and disable the benching for the key switching 2024-03-21 08:04:10 +01:00
Charlotte.Bonte
93a8de5e39 add all benching scenarios 2024-03-20 15:54:18 +01:00
Charlotte.Bonte
0d2ab6002f add key switching to square trick, add packing ks to scenario A and C 2024-03-20 15:52:52 +01:00
Charlotte.Bonte
729cf09699 fix benches with Arthur: make sure inputs and keys are non-trivial ciphertexts so the operations are actually performed 2024-03-19 11:41:52 +01:00
Charlotte.Bonte
0ebe0dd99a benches for clot21 vs square trick 2024-03-18 10:06:42 +01:00
Charlotte.Bonte
6c2768f5c2 write unit test for clot21 mult, make test_core_crypto only runs this test, add doctest for tensor product + relin 2024-03-13 09:46:55 +01:00
Arthur Meyre
5fc1f709cc fix buffer size for lwe sample extraction 2024-03-08 17:32:23 +01:00
Charlotte.Bonte
fa29b56d5f issue dimension during sample extraction 2024-03-08 17:06:23 +01:00
Charlotte.Bonte
0cd8ee773f writing benches for comparison clot21 with square trick 2024-03-08 10:03:01 +01:00
Charlotte.Bonte
760f78314e wip: add benchmarks for clot21 mult 2024-02-22 17:31:22 +01:00
Carl-Zama
4dd1a5a77b feat(tfhe): add glwe keyswitch, glwe tensor product, trace packing keyswitch and public functional packing keyswitch 2024-02-19 13:20:37 +01:00
908 changed files with 57270 additions and 102037 deletions

View File

@@ -1,9 +0,0 @@
self-hosted-runner:
# Labels of self-hosted runner in array of strings.
labels:
- m1mac
- 4090-desktop
# Configuration variables in array of strings defined in your repository or
# organization. `null` means disabling configuration variables check.
# Empty array means no configuration variable is allowed.
config-variables: null

View File

@@ -1,5 +1,5 @@
# Add labels in pull request
name: PR label manager
# Manage approved label in pull request
name: PR approved label manager
on:
pull_request:
@@ -21,16 +21,14 @@ jobs:
uses: actions-ecosystem/action-remove-labels@2ce5d41b4b6aa8503e285553f75ed56e0a40bae0
with:
# We use a PAT to have the same user (zama-bot) for label deletion as for creation.
github_token: ${{ secrets.FHE_ACTIONS_TOKEN }}
github_token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
labels: approved
# Add label only if the review is approved and if the label doesn't already exist
- name: Add approved label
uses: actions-ecosystem/action-add-labels@18f1af5e3544586314bbe15c0273249c770b2daf
if: ${{ github.event_name == 'pull_request_review'
&& github.event.review.state == 'approved'
&& !contains(fromJSON(env.LABELS), 'approved') }}
if: ${{ github.event_name == 'pull_request_review' && github.event.review.state == 'approved' && !contains(fromJSON(env.LABELS), 'approved') }}
with:
# We need to use a PAT to be able to trigger `labeled` event for the other workflow.
github_token: ${{ secrets.FHE_ACTIONS_TOKEN }}
github_token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
labels: approved

View File

@@ -6,7 +6,6 @@ env:
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUSTFLAGS: "-C target-cpu=native"
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
@@ -18,42 +17,40 @@ on:
pull_request:
jobs:
setup-instance:
name: Setup instance (fast-tests)
setup-ec2:
name: Setup EC2 instance (fast-tests)
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
instance-id: ${{ steps.start-instance.outputs.ec2-instance-id }}
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
uses: zama-ai/slab-github-runner@ab65ad70bb9f9e9251e4915ea5612bcad23cd9b1
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
backend: aws
profile: cpu-big
fast-tests:
name: Fast CPU tests
needs: setup-instance
needs: setup-ec2
concurrency:
group: ${{ github.workflow }}_${{ github.ref }}
cancel-in-progress: true
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
persist-credentials: 'false'
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
- name: Set up home
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
with:
toolchain: stable
@@ -61,10 +58,6 @@ jobs:
run: |
make test_concrete_csprng
- name: Run tfhe-zk-pok tests
run: |
make test_zk_pok
- name: Run core tests
run: |
AVX512_SUPPORT=ON make test_core_crypto
@@ -112,31 +105,32 @@ jobs:
- name: Slack Notification
if: ${{ always() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Fast AWS tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
teardown-instance:
name: Teardown instance (fast-tests)
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
needs: [ setup-instance, fast-tests ]
teardown-ec2:
name: Teardown EC2 instance (fast-tests)
if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
needs: [ setup-ec2, fast-tests ]
runs-on: ubuntu-latest
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
uses: zama-ai/slab-github-runner@ab65ad70bb9f9e9251e4915ea5612bcad23cd9b1
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
label: ${{ needs.setup-instance.outputs.runner-name }}
profile: cpu-big
label: ${{ needs.setup-ec2.outputs.runner-name }}
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Instance teardown (fast-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
SLACK_MESSAGE: "EC2 teardown (fast-tests) failed. (${{ env.ACTION_RUN_URL }})"

View File

@@ -6,7 +6,6 @@ env:
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUSTFLAGS: "-C target-cpu=native"
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
@@ -16,7 +15,7 @@ on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
workflow_dispatch:
pull_request:
types: [ labeled ]
types: [labeled]
jobs:
cuda-tests-linux:
@@ -29,12 +28,10 @@ jobs:
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
persist-credentials: 'false'
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
- name: Install latest stable
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
with:
toolchain: stable
@@ -44,9 +41,9 @@ jobs:
- name: Run clippy checks
run: |
make pcc_gpu
make clippy_gpu
- name: Run core crypto, integer and internal CUDA backend tests
- name: Run all tests
run: |
make test_gpu
@@ -58,12 +55,8 @@ jobs:
run: |
make test_c_api_gpu
- name: Run High Level API Tests
run: |
make test_high_level_api_gpu
- uses: actions-ecosystem/action-remove-labels@2ce5d41b4b6aa8503e285553f75ed56e0a40bae0
if: ${{ always() && github.event_name == 'pull_request' }}
if: ${{ github.event_name == 'pull_request' }}
with:
labels: 4090_test
github_token: ${{ secrets.GITHUB_TOKEN }}
@@ -71,7 +64,7 @@ jobs:
- name: Slack Notification
if: ${{ always() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "CUDA RTX 4090 tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"

View File

@@ -6,7 +6,6 @@ env:
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUSTFLAGS: "-C target-cpu=native"
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
@@ -18,30 +17,30 @@ on:
pull_request:
jobs:
setup-instance:
name: Setup instance (cuda-tests)
setup-ec2:
name: Setup EC2 instance (cuda-tests)
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
instance-id: ${{ steps.start-instance.outputs.ec2-instance-id }}
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
uses: zama-ai/slab-github-runner@ab65ad70bb9f9e9251e4915ea5612bcad23cd9b1
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
backend: aws
profile: gpu-test
cuda-pcc:
name: CUDA post-commit checks
needs: setup-instance
cuda-tests-linux:
name: CUDA tests
needs: setup-ec2
concurrency:
group: ${{ github.workflow }}_${{ github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
strategy:
fail-fast: false
# explicit include-based build matrix, of known valid options
@@ -55,16 +54,14 @@ jobs:
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
persist-credentials: 'false'
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
- name: Set up home
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
with:
toolchain: stable
@@ -80,12 +77,10 @@ jobs:
- name: Export gcc and g++ variables
if: ${{ !cancelled() }}
run: |
{
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
echo "HOME=/home/ubuntu";
} >> "${GITHUB_ENV}"
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Run fmt checks
run: |
@@ -93,69 +88,9 @@ jobs:
- name: Run clippy checks
run: |
make pcc_gpu
make clippy_gpu
- name: Slack Notification
if: ${{ always() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "CUDA AWS post-commit checks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
cuda-tests-linux:
name: CUDA tests
needs: [ setup-instance, cuda-pcc ]
concurrency:
group: ${{ github.workflow }}_${{ github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
strategy:
fail-fast: false
# explicit include-based build matrix, of known valid options
matrix:
include:
- os: ubuntu-22.04
cuda: "12.2"
gcc: 9
env:
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
persist-credentials: 'false'
- name: Set up home
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
with:
toolchain: stable
- name: Export CUDA variables
if: ${{ !cancelled() }}
run: |
echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
# Specify the correct host compilers
- name: Export gcc and g++ variables
if: ${{ !cancelled() }}
run: |
{
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
echo "HOME=/home/ubuntu";
} >> "${GITHUB_ENV}"
- name: Run core crypto, integer and internal CUDA backend tests
- name: Run all tests
run: |
make test_gpu
@@ -167,38 +102,36 @@ jobs:
run: |
make test_c_api_gpu
- name: Run High Level API Tests
run: |
make test_high_level_api_gpu
- name: Slack Notification
if: ${{ always() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "CUDA AWS tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
teardown-instance:
name: Teardown instance (cuda-tests)
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
needs: [ setup-instance, cuda-pcc, cuda-tests-linux ]
teardown-ec2:
name: Teardown EC2 instance (cuda-tests)
if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
needs: [ setup-ec2, cuda-tests-linux ]
runs-on: ubuntu-latest
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
uses: zama-ai/slab-github-runner@ab65ad70bb9f9e9251e4915ea5612bcad23cd9b1
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
label: ${{ needs.setup-instance.outputs.runner-name }}
profile: gpu-test
label: ${{ needs.setup-ec2.outputs.runner-name }}
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Instance teardown (cuda-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
SLACK_MESSAGE: "EC2 teardown (cuda-tests) failed. (${{ env.ACTION_RUN_URL }})"

View File

@@ -5,14 +5,10 @@ env:
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUSTFLAGS: "-C target-cpu=native"
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
# We clear the cache to reduce memory pressure because of the numerous processes of cargo
# nextest
TFHE_RS_CLEAR_IN_MEMORY_KEY_CACHE: "1"
on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
@@ -21,43 +17,41 @@ on:
types: [ labeled ]
jobs:
setup-instance:
name: Setup instance (unsigned-integer-tests)
setup-ec2:
name: Setup EC2 instance (unsigned-integer-tests)
if: ${{ github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'approved') }}
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
instance-id: ${{ steps.start-instance.outputs.ec2-instance-id }}
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
uses: zama-ai/slab-github-runner@ab65ad70bb9f9e9251e4915ea5612bcad23cd9b1
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
backend: aws
profile: cpu-big
unsigned-integer-tests:
name: Unsigned integer tests
needs: setup-instance
needs: setup-ec2
concurrency:
group: ${{ github.workflow }}_${{ github.ref }}
cancel-in-progress: true
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
persist-credentials: 'false'
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
- name: Set up home
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
with:
toolchain: stable
@@ -80,31 +74,32 @@ jobs:
- name: Slack Notification
if: ${{ always() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Unsigned Integer tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
teardown-instance:
name: Teardown instance (unsigned-integer-tests)
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
needs: [ setup-instance, unsigned-integer-tests ]
teardown-ec2:
name: Teardown EC2 instance (unsigned-integer-tests)
if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
needs: [ setup-ec2, unsigned-integer-tests ]
runs-on: ubuntu-latest
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
uses: zama-ai/slab-github-runner@ab65ad70bb9f9e9251e4915ea5612bcad23cd9b1
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
label: ${{ needs.setup-instance.outputs.runner-name }}
profile: cpu-big
label: ${{ needs.setup-ec2.outputs.runner-name }}
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Instance teardown (unsigned-integer-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
SLACK_MESSAGE: "EC2 teardown (unsigned-integer-tests) failed. (${{ env.ACTION_RUN_URL }})"

View File

@@ -1,134 +0,0 @@
# Compile and test tfhe-cuda-backend on an AWS instance
name: TFHE Cuda Backend - Full tests multi-GPU
env:
CARGO_TERM_COLOR: always
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUSTFLAGS: "-C target-cpu=native"
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
workflow_dispatch:
pull_request:
jobs:
setup-instance:
name: Setup instance (cuda-tests-multi-gpu)
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
backend: aws
profile: multi-gpu-test
cuda-tests-linux:
name: CUDA multi-GPU tests
needs: [ setup-instance ]
concurrency:
group: ${{ github.workflow }}_${{ github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
strategy:
fail-fast: false
# explicit include-based build matrix, of known valid options
matrix:
include:
- os: ubuntu-22.04
cuda: "12.2"
gcc: 9
env:
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
- name: Set up home
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
with:
toolchain: stable
- name: Export CUDA variables
if: ${{ !cancelled() }}
run: |
echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
# Specify the correct host compilers
- name: Export gcc and g++ variables
if: ${{ !cancelled() }}
run: |
{
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
echo "HOME=/home/ubuntu";
} >> "${GITHUB_ENV}"
- name: Run core crypto, integer and internal CUDA backend tests
run: |
make test_gpu
- name: Run user docs tests
run: |
make test_user_doc_gpu
- name: Test C API
run: |
make test_c_api_gpu
- name: Run High Level API Tests
run: |
make test_high_level_api_gpu
- name: Slack Notification
if: ${{ always() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "CUDA AWS multi-GPU tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
teardown-instance:
name: Teardown instance (cuda-tests-multi-gpu)
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
needs: [ setup-instance, cuda-tests-linux ]
runs-on: ubuntu-latest
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
label: ${{ needs.setup-instance.outputs.runner-name }}
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Instance teardown (cuda-tests-multi-gpu) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"

View File

@@ -5,14 +5,10 @@ env:
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUSTFLAGS: "-C target-cpu=native"
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
# We clear the cache to reduce memory pressure because of the numerous processes of cargo
# nextest
TFHE_RS_CLEAR_IN_MEMORY_KEY_CACHE: "1"
on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
@@ -21,43 +17,41 @@ on:
types: [ labeled ]
jobs:
setup-instance:
name: Setup instance (signed-integer-tests)
setup-ec2:
name: Setup EC2 instance (signed-integer-tests)
if: ${{ github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'approved') }}
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
instance-id: ${{ steps.start-instance.outputs.ec2-instance-id }}
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
uses: zama-ai/slab-github-runner@ab65ad70bb9f9e9251e4915ea5612bcad23cd9b1
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
backend: aws
profile: cpu-big
signed-integer-tests:
name: Signed integer tests
needs: setup-instance
needs: setup-ec2
concurrency:
group: ${{ github.workflow }}_${{ github.ref }}
cancel-in-progress: true
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
persist-credentials: 'false'
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
- name: Set up home
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
with:
toolchain: stable
@@ -84,31 +78,32 @@ jobs:
- name: Slack Notification
if: ${{ always() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Signed Integer tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
teardown-instance:
name: Teardown instance (signed-integer-tests)
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
needs: [ setup-instance, signed-integer-tests ]
teardown-ec2:
name: Teardown EC2 instance (signed-integer-tests)
if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
needs: [ setup-ec2, signed-integer-tests ]
runs-on: ubuntu-latest
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
uses: zama-ai/slab-github-runner@ab65ad70bb9f9e9251e4915ea5612bcad23cd9b1
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
label: ${{ needs.setup-instance.outputs.runner-name }}
profile: cpu-big
label: ${{ needs.setup-ec2.outputs.runner-name }}
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Instance teardown (signed-integer-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
SLACK_MESSAGE: "EC2 teardown (signed-integer-tests) failed. (${{ env.ACTION_RUN_URL }})"

View File

@@ -5,216 +5,94 @@ env:
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUSTFLAGS: "-C target-cpu=native"
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}
on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
workflow_dispatch:
pull_request:
types: [ labeled ]
schedule:
# Nightly tests @ 1AM after each work day
- cron: "0 1 * * MON-FRI"
jobs:
should-run:
runs-on: ubuntu-latest
if: github.event_name != 'schedule' ||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
permissions:
pull-requests: write
outputs:
csprng_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.csprng_any_changed }}
zk_pok_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.zk_pok_any_changed }}
core_crypto_test: ${{ env.IS_PULL_REQUEST == 'false' ||
steps.changed-files.outputs.core_crypto_any_changed ||
steps.changed-files.outputs.dependencies_any_changed }}
boolean_test: ${{ env.IS_PULL_REQUEST == 'false' ||
steps.changed-files.outputs.boolean_any_changed ||
steps.changed-files.outputs.dependencies_any_changed }}
shortint_test: ${{ env.IS_PULL_REQUEST == 'false' ||
steps.changed-files.outputs.shortint_any_changed ||
steps.changed-files.outputs.dependencies_any_changed }}
high_level_api_test: ${{ env.IS_PULL_REQUEST == 'false' ||
steps.changed-files.outputs.high_level_api_any_changed ||
steps.changed-files.outputs.dependencies_any_changed }}
c_api_test: ${{ env.IS_PULL_REQUEST == 'false' ||
steps.changed-files.outputs.c_api_any_changed ||
steps.changed-files.outputs.dependencies_any_changed }}
examples_test: ${{ env.IS_PULL_REQUEST == 'false' ||
steps.changed-files.outputs.examples_any_changed ||
steps.changed-files.outputs.dependencies_any_changed }}
apps_test: ${{ env.IS_PULL_REQUEST == 'false' ||
steps.changed-files.outputs.apps_any_changed || steps.changed-files.outputs.dependencies_any_changed }}
user_docs_test: ${{ env.IS_PULL_REQUEST == 'false' ||
steps.changed-files.outputs.user_docs_any_changed ||
steps.changed-files.outputs.dependencies_any_changed }}
any_file_changed: ${{ env.IS_PULL_REQUEST == 'false' || steps.aggregated-changes.outputs.any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
fetch-depth: 0
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@d6babd6899969df1a11d14c368283ea4436bca78
with:
since_last_remote_commit: true
files_yaml: |
dependencies:
- tfhe/Cargo.toml
- concrete-csprng/**
- tfhe-zk-pok/**
csprng:
- concrete-csprng/**
zk_pok:
- tfhe-zk-pok/**
core_crypto:
- tfhe/src/core_crypto/**
boolean:
- tfhe/src/core_crypto/**
- tfhe/src/boolean/**
shortint:
- tfhe/src/core_crypto/**
- tfhe/src/shortint/**
high_level_api:
- tfhe/src/**
- '!tfhe/src/c_api/**'
c_api:
- tfhe/src/**
examples:
- tfhe/src/**
- '!tfhe/src/c_api/**'
- tfhe/examples/**
apps:
- tfhe/src/**
- '!tfhe/src/c_api/**'
- apps/trivium/src/**
user_docs:
- tfhe/src/**
- '!tfhe/src/c_api/**'
- 'tfhe/docs/**.md'
- README.md
- name: Aggregate file changes
id: aggregated-changes
if: ( steps.changed-files.outputs.dependencies_any_changed == 'true' ||
steps.changed-files.outputs.csprng_any_changed == 'true' ||
steps.changed-files.outputs.zk_pok_any_changed == 'true' ||
steps.changed-files.outputs.core_crypto_any_changed == 'true' ||
steps.changed-files.outputs.boolean_any_changed == 'true' ||
steps.changed-files.outputs.shortint_any_changed == 'true' ||
steps.changed-files.outputs.high_level_api_any_changed == 'true' ||
steps.changed-files.outputs.c_api_any_changed == 'true' ||
steps.changed-files.outputs.examples_any_changed == 'true' ||
steps.changed-files.outputs.apps_any_changed == 'true' ||
steps.changed-files.outputs.user_docs_any_changed == 'true')
run: |
echo "any_changed=true" >> "$GITHUB_OUTPUT"
setup-instance:
name: Setup instance (cpu-tests)
if: github.event_name != 'pull_request' ||
(github.event_name == 'pull_request' && needs.should-run.outputs.any_file_changed == 'true')
needs: should-run
setup-ec2:
name: Setup EC2 instance (cpu-tests)
if: ${{ github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'approved') }}
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
instance-id: ${{ steps.start-instance.outputs.ec2-instance-id }}
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
uses: zama-ai/slab-github-runner@ab65ad70bb9f9e9251e4915ea5612bcad23cd9b1
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
backend: aws
profile: cpu-big
cpu-tests:
name: CPU tests
if: github.event_name != 'pull_request' ||
(github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
needs: [ should-run, setup-instance ]
needs: setup-ec2
concurrency:
group: ${{ github.workflow }}_${{github.event_name}}_${{ github.ref }}
group: ${{ github.workflow }}_${{ github.ref }}
cancel-in-progress: true
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
persist-credentials: 'false'
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
- name: Set up home
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
with:
toolchain: stable
- name: Run concrete-csprng tests
if: needs.should-run.outputs.csprng_test == 'true'
run: |
make test_concrete_csprng
- name: Run tfhe-zk-pok tests
if: needs.should-run.outputs.zk_pok_test == 'true'
run: |
make test_zk_pok
- name: Run core tests
if: needs.should-run.outputs.core_crypto_test == 'true'
run: |
AVX512_SUPPORT=ON make test_core_crypto
- name: Run boolean tests
if: needs.should-run.outputs.boolean_test == 'true'
run: |
make test_boolean
- name: Run C API tests
if: needs.should-run.outputs.c_api_test == 'true'
run: |
make test_c_api
- name: Run user docs tests
if: needs.should-run.outputs.user_docs_test == 'true'
run: |
make test_user_doc
- name: Gen Keys if required
if: needs.should-run.outputs.shortint_test == 'true'
run: |
make gen_key_cache
- name: Run shortint tests
if: needs.should-run.outputs.shortint_test == 'true'
run: |
BIG_TESTS_INSTANCE=TRUE make test_shortint_ci
- name: Run high-level API tests
if: needs.should-run.outputs.high_level_api_test == 'true'
run: |
BIG_TESTS_INSTANCE=TRUE make test_high_level_api
- name: Run example tests
if: needs.should-run.outputs.examples_test == 'true'
run: |
make test_examples
make dark_market
- name: Run apps tests
if: needs.should-run.outputs.apps_test == 'true'
run: |
make test_trivium
make test_kreyvium
@@ -222,31 +100,32 @@ jobs:
- name: Slack Notification
if: ${{ always() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "CPU tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
teardown-instance:
name: Teardown instance (cpu-tests)
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
needs: [ setup-instance, cpu-tests ]
teardown-ec2:
name: Teardown EC2 instance (cpu-tests)
if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
needs: [ setup-ec2, cpu-tests ]
runs-on: ubuntu-latest
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
uses: zama-ai/slab-github-runner@ab65ad70bb9f9e9251e4915ea5612bcad23cd9b1
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
label: ${{ needs.setup-instance.outputs.runner-name }}
profile: cpu-big
label: ${{ needs.setup-ec2.outputs.runner-name }}
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Instance teardown (cpu-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
SLACK_MESSAGE: "EC2 teardown (cpu-tests) failed. (${{ env.ACTION_RUN_URL }})"

View File

@@ -5,7 +5,6 @@ env:
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUSTFLAGS: "-C target-cpu=native"
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
@@ -18,90 +17,82 @@ on:
types: [ labeled ]
jobs:
setup-instance:
name: Setup instance (wasm-tests)
setup-ec2:
name: Setup EC2 instance (wasm-tests)
if: ${{ github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'approved') }}
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
instance-id: ${{ steps.start-instance.outputs.ec2-instance-id }}
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
uses: zama-ai/slab-github-runner@ab65ad70bb9f9e9251e4915ea5612bcad23cd9b1
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
backend: aws
profile: cpu-small
wasm-tests:
name: WASM tests
needs: setup-instance
needs: setup-ec2
concurrency:
group: ${{ github.workflow }}_${{ github.ref }}
cancel-in-progress: true
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
persist-credentials: 'false'
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
- name: Set up home
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
with:
toolchain: stable
- name: Install Node
run: |
make install_node
- name: Run fmt checks
run: |
make check_fmt_js
- name: Run js on wasm API tests
run: |
make test_nodejs_wasm_api_in_docker
- name: Run parallel wasm tests
run: |
make install_node
make ci_test_web_js_api_parallel
- name: Slack Notification
if: ${{ always() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "WASM tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
teardown-instance:
name: Teardown instance (wasm-tests)
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
needs: [ setup-instance, wasm-tests ]
teardown-ec2:
name: Teardown EC2 instance (wasm-tests)
if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
needs: [ setup-ec2, wasm-tests ]
runs-on: ubuntu-latest
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
uses: zama-ai/slab-github-runner@ab65ad70bb9f9e9251e4915ea5612bcad23cd9b1
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
label: ${{ needs.setup-instance.outputs.runner-name }}
profile: cpu-small
label: ${{ needs.setup-ec2.outputs.runner-name }}
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Instance teardown (wasm-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
SLACK_MESSAGE: "EC2 teardown (wasm-tests) failed. (${{ env.ACTION_RUN_URL }})"

View File

@@ -33,7 +33,6 @@ env:
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
jobs:
run-boolean-benchmarks:
@@ -53,7 +52,7 @@ jobs:
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
with:
fetch-depth: 0
@@ -63,7 +62,7 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
with:
toolchain: nightly
@@ -97,17 +96,17 @@ jobs:
--append-results
- name: Upload parsed results artifact
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
with:
name: ${{ github.sha }}_boolean
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
with:
repository: zama-ai/slab
path: slab
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
- name: Send data to Slab
shell: bash
@@ -126,11 +125,11 @@ jobs:
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
env:
SLACK_COLOR: ${{ job.status }}
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_MESSAGE: "Boolean benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
SLACK_MESSAGE: "Boolean benchmarks failed. (${{ env.ACTION_RUN_URL }})"
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

View File

@@ -7,7 +7,6 @@ env:
CARGO_TERM_COLOR: always
RUSTFLAGS: "-C target-cpu=native"
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref }}
@@ -23,7 +22,7 @@ jobs:
fail-fast: false
steps:
- uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
- uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
- name: Install and run newline linter checks
if: matrix.os == 'ubuntu-latest'
@@ -68,9 +67,5 @@ jobs:
run: |
make build_c_api
- name: Build coverage tests
run: |
make build_tfhe_coverage
# The wasm build check is a bit annoying to set up here and is done during the tests in
# aws_tfhe_tests.yml

View File

@@ -10,7 +10,7 @@ jobs:
- name: Check first line
uses: gsactions/commit-message-checker@16fa2d5de096ae0d35626443bcd24f1e756cafee
with:
pattern: '^((feat|fix|chore|refactor|style|test|docs|doc)(\([\w\-_]+\))?\:) .+$'
pattern: '^((feat|fix|chore|refactor|style|test|docs|doc)(\(\w+\))?\:) .+$'
flags: "gs"
error: 'Your first line has to contain a commit type and scope like "feat(my_feature): msg".'
excludeDescription: "true" # optional: this excludes the description body of a pull request

View File

@@ -1,27 +0,0 @@
# Lint and check CI
name: CI Lint and Checks
on:
pull_request:
env:
ACTIONLINT_VERSION: 1.6.27
jobs:
lint-check:
name: Lint and checks
runs-on: ubuntu-latest
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
- name: Get actionlint
run: |
bash <(curl https://raw.githubusercontent.com/rhysd/actionlint/main/scripts/download-actionlint.bash) ${{ env.ACTIONLINT_VERSION }}
echo "f2ee6d561ce00fa93aab62a7791c1a0396ec7e8876b2a8f2057475816c550782 actionlint" > checksum
sha256sum -c checksum
ln -s "$(pwd)/actionlint" /usr/local/bin/
- name: Lint workflows
run: |
make lint_workflow

View File

@@ -5,59 +5,70 @@ env:
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUSTFLAGS: "-C target-cpu=native"
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
workflow_dispatch:
# Code coverage workflow is only run via workflow_dispatch event since execution duration is not stabilized yet.
# All the inputs are provided by Slab
inputs:
instance_id:
description: "AWS instance ID"
type: string
instance_image_id:
description: "AWS instance AMI ID"
type: string
instance_type:
description: "AWS instance product type"
type: string
runner_name:
description: "Action runner name"
type: string
request_id:
description: 'Slab request ID'
type: string
fork_repo:
description: 'Name of forked repo as user/repo'
type: string
fork_git_sha:
description: 'Git SHA to checkout from fork'
type: string
jobs:
setup-instance:
name: Setup instance (code-coverage)
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
backend: aws
profile: cpu-small
code-coverage:
name: Code coverage tests
needs: setup-instance
concurrency:
group: ${{ github.workflow }}_${{ github.event_name }}_${{ github.ref }}
group: ${{ github.workflow }}_${{ github.ref }}_${{ inputs.instance_image_id }}_${{ inputs.instance_type }}
cancel-in-progress: true
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
timeout-minutes: 5760 # 4 days
runs-on: ${{ inputs.runner_name }}
timeout-minutes: 1080
steps:
# Step used for log purpose.
- name: Instance configuration used
run: |
echo "ID: ${{ inputs.instance_id }}"
echo "AMI: ${{ inputs.instance_image_id }}"
echo "Type: ${{ inputs.instance_type }}"
echo "Request ID: ${{ inputs.request_id }}"
echo "Fork repo: ${{ inputs.fork_repo }}"
echo "Fork git sha: ${{ inputs.fork_git_sha }}"
- name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
with:
repository: ${{ inputs.fork_repo }}
ref: ${{ inputs.fork_git_sha }}
- name: Set up home
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
with:
toolchain: stable
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@d6babd6899969df1a11d14c368283ea4436bca78
uses: tj-actions/changed-files@ec75ae5ab7296b81fd4cddb77294d6718932ebab
with:
files_yaml: |
tfhe:
@@ -87,7 +98,7 @@ jobs:
make test_shortint_cov
- name: Upload tfhe coverage to Codecov
uses: codecov/codecov-action@e28ff129e5465c2c0dcc6f003fc735cb6ae0c673
uses: codecov/codecov-action@e0b68c6749509c5f83f984dd99a76a1c1a231044
if: steps.changed-files.outputs.tfhe_any_changed == 'true'
with:
token: ${{ secrets.CODECOV_TOKEN }}
@@ -95,48 +106,14 @@ jobs:
fail_ci_if_error: true
files: shortint/cobertura.xml,boolean/cobertura.xml,core_crypto/cobertura.xml,core_crypto_avx512/cobertura.xml
- name: Run integer coverage
if: steps.changed-files.outputs.tfhe_any_changed == 'true'
run: |
make test_integer_cov
- name: Upload tfhe coverage to Codecov
uses: codecov/codecov-action@e28ff129e5465c2c0dcc6f003fc735cb6ae0c673
if: steps.changed-files.outputs.tfhe_any_changed == 'true'
with:
token: ${{ secrets.CODECOV_TOKEN }}
directory: ./coverage/
fail_ci_if_error: true
files: integer/cobertura.xml
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
env:
SLACK_COLOR: ${{ job.status }}
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_MESSAGE: "Code coverage finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
teardown-instance:
name: Teardown instance (code-coverage)
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
needs: [ setup-instance, code-coverage ]
runs-on: ubuntu-latest
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
label: ${{ needs.setup-instance.outputs.runner-name }}
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Instance teardown (code-coverage) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

View File

@@ -33,7 +33,6 @@ env:
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
jobs:
run-core-crypto-benchmarks:
@@ -53,7 +52,7 @@ jobs:
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
with:
fetch-depth: 0
@@ -63,14 +62,13 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
with:
toolchain: nightly
- name: Run benchmarks with AVX512
run: |
make bench_pbs
make bench_pbs128
make bench_ks
- name: Parse results
@@ -89,17 +87,17 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
with:
name: ${{ github.sha }}_core_crypto
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
with:
repository: zama-ai/slab
path: slab
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
- name: Send data to Slab
shell: bash
@@ -118,11 +116,11 @@ jobs:
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
env:
SLACK_COLOR: ${{ job.status }}
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_MESSAGE: "PBS benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
SLACK_MESSAGE: "PBS benchmarks failed. (${{ env.ACTION_RUN_URL }})"
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

View File

@@ -1,45 +1,43 @@
# Run core crypto benchmarks on an instance with CUDA and return parsed results to Slab CI bot.
# Run core crypto benchmarks on an AWS instance with CUDA and return parsed results to Slab CI bot.
name: Core crypto GPU benchmarks
on:
workflow_dispatch:
schedule:
# Weekly benchmarks will be triggered each Saturday at 1 a.m.
- cron: '0 1 * * 6'
inputs:
instance_id:
description: "Instance ID"
type: string
instance_image_id:
description: "Instance AMI ID"
type: string
instance_type:
description: "Instance product type"
type: string
runner_name:
description: "Action runner name"
type: string
request_id:
description: "Slab request ID"
type: string
# This input is not used in this workflow but is still mandatory since a calling workflow could
# use it. If a triggering command includes a user_inputs field, then the triggered workflow
# must include this very input, otherwise the workflow won't be called.
# See start_full_benchmarks.yml as an example.
user_inputs:
description: "Type of benchmarks to run"
type: string
default: "weekly_benchmarks"
env:
CARGO_TERM_COLOR: always
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
jobs:
setup-instance:
name: Setup instance (cuda-core-crypto-benchmarks)
runs-on: ubuntu-latest
if: github.event_name != 'schedule' ||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
backend: hyperstack
profile: single-h100
cuda-core-crypto-benchmarks:
name: Execute GPU core crypto benchmarks
needs: setup-instance
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
run-core-crypto-benchmarks:
name: Execute GPU core crypto benchmarks in EC2
runs-on: ${{ github.event.inputs.runner_name }}
if: ${{ !cancelled() }}
strategy:
fail-fast: false
# explicit include-based build matrix, of known valid options
@@ -47,66 +45,52 @@ jobs:
include:
- os: ubuntu-22.04
cuda: "12.2"
gcc: 11
gcc: 9
env:
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
CMAKE_VERSION: 3.29.1
steps:
# Mandatory on hyperstack since a bootable volume is not re-usable yet.
- name: Install dependencies
- name: Instance configuration used
run: |
sudo apt update
sudo apt install -y checkinstall zlib1g-dev libssl-dev
wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
cd cmake-${{ env.CMAKE_VERSION }}
./bootstrap
make -j"$(nproc)"
sudo make install
echo "IDs: ${{ inputs.instance_id }}"
echo "AMI: ${{ inputs.instance_image_id }}"
echo "Type: ${{ inputs.instance_type }}"
echo "Request ID: ${{ inputs.request_id }}"
- name: Get benchmark date
run: |
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
with:
fetch-depth: 0
- name: Get benchmark details
run: |
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
- name: Set up home
# The "Install rust" step requires the root user to have a HOME directory, which is not set.
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
with:
toolchain: nightly
- name: Export CUDA variables
if: ${{ !cancelled() }}
run: |
{
echo "CUDA_PATH=$CUDA_PATH";
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
} >> "${GITHUB_ENV}"
echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
# Specify the correct host compilers
- name: Export gcc and g++ variables
if: ${{ !cancelled() }}
run: |
{
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
echo "HOME=/home/ubuntu";
} >> "${GITHUB_ENV}"
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Run benchmarks with AVX512
run: |
@@ -115,30 +99,32 @@ jobs:
- name: Parse results
run: |
COMMIT_DATE="$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})"
COMMIT_HASH="$(git describe --tags --dirty)"
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
--database tfhe_rs \
--hardware "n3-H100x1" \
--hardware ${{ inputs.instance_type }} \
--backend gpu \
--project-version "${{ env.COMMIT_HASH }}" \
--project-version "${COMMIT_HASH}" \
--branch ${{ github.ref_name }} \
--commit-date "${{ env.COMMIT_DATE }}" \
--commit-date "${COMMIT_DATE}" \
--bench-date "${{ env.BENCH_DATE }}" \
--name-suffix avx512 \
--walk-subdirs \
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
with:
name: ${{ github.sha }}_core_crypto
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
with:
repository: zama-ai/slab
path: slab
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
- name: Send data to Slab
shell: bash
@@ -154,39 +140,14 @@ jobs:
-d @${{ env.RESULTS_FILENAME }} \
${{ secrets.SLAB_URL }}
slack-notify:
name: Slack Notification
needs: [ setup-instance, cuda-core-crypto-benchmarks ]
runs-on: ubuntu-latest
if: ${{ !success() && !cancelled() }}
continue-on-error: true
steps:
- name: Send message
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ needs.cuda-core-crypto-benchmarks.result }}
SLACK_MESSAGE: "PBS GPU benchmarks finished with status: ${{ needs.cuda-core-crypto-benchmarks.result }}. (${{ env.ACTION_RUN_URL }})"
teardown-instance:
name: Teardown instance (cuda-integer-full-benchmarks)
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
needs: [ setup-instance, cuda-core-crypto-benchmarks, slack-notify ]
runs-on: ubuntu-latest
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
label: ${{ needs.setup-instance.outputs.runner-name }}
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Instance teardown (cuda-core-crypto-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_MESSAGE: "PBS GPU benchmarks failed. (${{ env.ACTION_RUN_URL }})"
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

View File

@@ -5,7 +5,6 @@ env:
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUSTFLAGS: "-C target-cpu=native"
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
@@ -17,44 +16,43 @@ on:
pull_request:
types: [ labeled ]
jobs:
setup-instance:
name: Setup instance (csprng-randomness-tests)
setup-ec2:
name: Setup EC2 instance (csprng-randomness-tests)
if: ${{ github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'approved') }}
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
instance-id: ${{ steps.start-instance.outputs.ec2-instance-id }}
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
uses: zama-ai/slab-github-runner@ab65ad70bb9f9e9251e4915ea5612bcad23cd9b1
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
backend: aws
profile: cpu-small
csprng-randomness-tests:
name: CSPRNG randomness tests
needs: setup-instance
needs: setup-ec2
concurrency:
group: ${{ github.workflow }}_${{ github.ref }}
cancel-in-progress: true
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
persist-credentials: 'false'
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
- name: Set up home
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
with:
toolchain: stable
@@ -65,31 +63,32 @@ jobs:
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "concrete-csprng randomness check finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
teardown-instance:
name: Teardown instance (csprng-randomness-tests)
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
needs: [ setup-instance, csprng-randomness-tests ]
teardown-ec2:
name: Teardown EC2 instance (csprng-randomness-tests)
if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
needs: [ setup-ec2, csprng-randomness-tests ]
runs-on: ubuntu-latest
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
uses: zama-ai/slab-github-runner@ab65ad70bb9f9e9251e4915ea5612bcad23cd9b1
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
label: ${{ needs.setup-instance.outputs.runner-name }}
profile: cpu-small
label: ${{ needs.setup-ec2.outputs.runner-name }}
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Instance teardown (csprng-randomness-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
SLACK_MESSAGE: "EC2 teardown (csprng-randomness-tests) failed. (${{ env.ACTION_RUN_URL }})"

View File

@@ -1,202 +0,0 @@
# Run all benchmarks on an RTX 4090 machine and return parsed results to Slab CI bot.
name: TFHE Cuda Backend - 4090 full benchmarks
env:
CARGO_TERM_COLOR: always
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
workflow_dispatch:
pull_request:
types: [labeled]
schedule:
# Weekly benchmarks will be triggered each Friday at 9 p.m.
- cron: "0 21 * * 5"
jobs:
cuda-integer-benchmarks:
name: Cuda integer benchmarks for all operations flavor (RTX 4090)
if: ${{ github.event_name == 'workflow_dispatch' || github.event_name == 'schedule' || contains(github.event.label.name, '4090_bench') }}
concurrency:
group: ${{ github.workflow }}_${{ github.ref }}_cuda_integer_bench
cancel-in-progress: true
runs-on: ["self-hosted", "4090-desktop"]
timeout-minutes: 1440 # 24 hours
strategy:
fail-fast: false
max-parallel: 1
matrix:
command: [integer, integer_multi_bit]
op_flavor: [default, unchecked]
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
fetch-depth: 0
- name: Get benchmark details
run: |
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
with:
toolchain: nightly
- name: Checkout Slab repo
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
repository: zama-ai/slab
path: slab
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
- name: Run integer benchmarks
run: |
make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_${{ matrix.command }}_gpu
- name: Parse results
run: |
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
--database tfhe_rs \
--hardware "rtx4090" \
--backend gpu \
--project-version "${{ env.COMMIT_HASH }}" \
--branch ${{ github.ref_name }} \
--commit-date "${{ env.COMMIT_DATE }}" \
--bench-date "${{ env.BENCH_DATE }}" \
--walk-subdirs \
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
with:
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
path: ${{ env.RESULTS_FILENAME }}
- name: Send data to Slab
shell: bash
run: |
echo "Computing HMac on results file"
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
echo "Sending results to Slab..."
curl -v -k \
-H "Content-Type: application/json" \
-H "X-Slab-Repository: ${{ github.repository }}" \
-H "X-Slab-Command: store_data_v2" \
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-d @${{ env.RESULTS_FILENAME }} \
${{ secrets.SLAB_URL }}
- name: Slack Notification
if: ${{ always() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Integer RTX 4090 full benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
cuda-core-crypto-benchmarks:
name: Cuda core crypto benchmarks (RTX 4090)
if: ${{ github.event_name == 'workflow_dispatch' || github.event_name == 'schedule' || contains(github.event.label.name, '4090_bench') }}
needs: cuda-integer-benchmarks
concurrency:
group: ${{ github.workflow }}_${{ github.ref }}_cuda_core_crypto_bench
cancel-in-progress: true
runs-on: ["self-hosted", "4090-desktop"]
timeout-minutes: 1440 # 24 hours
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
fetch-depth: 0
- name: Get benchmark details
run: |
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
with:
toolchain: nightly
- name: Checkout Slab repo
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
repository: zama-ai/slab
path: slab
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
- name: Run integer benchmarks
run: |
make bench_pbs_gpu
make bench_ks_gpu
- name: Parse results
run: |
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
--database tfhe_rs \
--hardware "rtx4090" \
--backend gpu \
--project-version "${{ env.COMMIT_HASH }}" \
--branch ${{ github.ref_name }} \
--commit-date "${{ env.COMMIT_DATE }}" \
--bench-date "${{ env.BENCH_DATE }}" \
--walk-subdirs \
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
with:
name: ${{ github.sha }}_core_crypto
path: ${{ env.RESULTS_FILENAME }}
- name: Send data to Slab
shell: bash
run: |
echo "Computing HMac on results file"
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
echo "Sending results to Slab..."
curl -v -k \
-H "Content-Type: application/json" \
-H "X-Slab-Repository: ${{ github.repository }}" \
-H "X-Slab-Command: store_data_v2" \
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-d @${{ env.RESULTS_FILENAME }} \
${{ secrets.SLAB_URL }}
- name: Slack Notification
if: ${{ !success() && !cancelled() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Core crypto RTX 4090 full benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
remove_github_label:
name: Remove 4090 bench label
if: ${{ always() && github.event_name == 'pull_request' }}
needs: [cuda-integer-benchmarks, cuda-core-crypto-benchmarks]
runs-on: ubuntu-latest
steps:
- uses: actions-ecosystem/action-remove-labels@2ce5d41b4b6aa8503e285553f75ed56e0a40bae0
with:
labels: 4090_bench
github_token: ${{ secrets.GITHUB_TOKEN }}

View File

@@ -1,160 +0,0 @@
# Compile and test tfhe-cuda-backend on an H100 VM on hyperstack
name: TFHE Cuda Backend - Full tests on H100
env:
CARGO_TERM_COLOR: always
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUSTFLAGS: "-C target-cpu=native"
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
workflow_dispatch:
pull_request:
jobs:
setup-instance:
name: Setup instance (cuda-h100-tests)
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
backend: hyperstack
profile: single-h100
cuda-tests-linux:
name: CUDA H100 tests
needs: [ setup-instance ]
concurrency:
group: ${{ github.workflow }}_${{ github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
strategy:
fail-fast: false
# explicit include-based build matrix, of known valid options
matrix:
include:
- os: ubuntu-22.04
cuda: "12.2"
gcc: 11
env:
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
CMAKE_VERSION: 3.29.1
steps:
# Mandatory on hyperstack since a bootable volume is not re-usable yet.
- name: Install dependencies
run: |
sudo apt update
sudo apt install ca-certificates curl
sudo install -m 0755 -d /etc/apt/keyrings
sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
sudo chmod a+r /etc/apt/keyrings/docker.asc
echo \
"deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \
$(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \
sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
sudo apt update
sudo apt install -y checkinstall zlib1g-dev libssl-dev docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin
wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
cd cmake-${{ env.CMAKE_VERSION }}
./bootstrap
make -j"$(nproc)"
sudo make install
- name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
- name: Set up home
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
with:
toolchain: stable
- name: Export CUDA variables
if: ${{ !cancelled() }}
run: |
echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
# Specify the correct host compilers
- name: Export gcc and g++ variables
if: ${{ !cancelled() }}
run: |
{
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
echo "HOME=/home/ubuntu";
} >> "${GITHUB_ENV}"
- name: Run core crypto, integer and internal CUDA backend tests
run: |
make test_gpu
- name: Run user docs tests
run: |
make test_user_doc_gpu
- name: Test C API
run: |
make test_c_api_gpu
- name: Run High Level API Tests
run: |
make test_high_level_api_gpu
slack-notify:
name: Slack Notification
needs: [ setup-instance, cuda-tests-linux ]
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
if: ${{ !success() && !cancelled() }}
continue-on-error: true
steps:
- name: Send message
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}
SLACK_MESSAGE: "Integer GPU H100 tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.ACTION_RUN_URL }})"
teardown-instance:
name: Teardown instance (cuda-h100-tests)
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
needs: [ setup-instance, cuda-tests-linux ]
runs-on: ubuntu-latest
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
label: ${{ needs.setup-instance.outputs.runner-name }}
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Instance teardown (cuda-h100-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"

View File

@@ -26,7 +26,6 @@ env:
PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
jobs:
run-integer-benchmarks:
@@ -46,7 +45,7 @@ jobs:
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
with:
fetch-depth: 0
@@ -56,7 +55,7 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
with:
toolchain: nightly
@@ -70,7 +69,7 @@ jobs:
parse_integer_benches
- name: Upload csv results artifact
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
with:
name: ${{ github.sha }}_csv_integer
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
@@ -91,17 +90,17 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
with:
name: ${{ github.sha }}_integer
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
with:
repository: zama-ai/slab
path: slab
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
- name: Send data to Slab
shell: bash
@@ -120,11 +119,11 @@ jobs:
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
env:
SLACK_COLOR: ${{ job.status }}
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_MESSAGE: "Integer benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
SLACK_MESSAGE: "Integer benchmarks failed. (${{ env.ACTION_RUN_URL }})"
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

View File

@@ -29,7 +29,6 @@ env:
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
jobs:
prepare-matrix:
@@ -41,17 +40,17 @@ jobs:
- name: Weekly benchmarks
if: ${{ github.event.inputs.user_inputs == 'weekly_benchmarks' }}
run: |
echo "OP_FLAVOR=[\"default\"]" >> "${GITHUB_ENV}"
echo "OP_FLAVOR=[\"default\"]" >> ${GITHUB_ENV}
- name: Quarterly benchmarks
if: ${{ github.event.inputs.user_inputs == 'quarterly_benchmarks' }}
run: |
echo "OP_FLAVOR=[\"default\", \"smart\", \"unchecked\", \"misc\"]" >> "${GITHUB_ENV}"
echo "OP_FLAVOR=[\"default\", \"smart\", \"unchecked\", \"misc\"]" >> ${GITHUB_ENV}
- name: Set operation flavor output
id: set_op_flavor
run: |
echo "op_flavor=${{ toJSON(env.OP_FLAVOR) }}" >> "${GITHUB_OUTPUT}"
echo "op_flavor=${{ toJSON(env.OP_FLAVOR) }}" >> ${GITHUB_OUTPUT}
integer-benchmarks:
name: Execute integer benchmarks for all operations flavor
@@ -74,17 +73,15 @@ jobs:
echo "Request ID: ${{ inputs.request_id }}"
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
with:
fetch-depth: 0
- name: Get benchmark details
run: |
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})" >> "${GITHUB_ENV}"
echo "COMMIT_HASH=$(git describe --tags --dirty)" >> "${GITHUB_ENV}"
- name: Set up home
# The "Install rust" step requires the root user to have a HOME directory, which is not set.
@@ -92,16 +89,16 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
with:
toolchain: nightly
- name: Checkout Slab repo
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
with:
repository: zama-ai/slab
path: slab
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
- name: Run benchmarks with AVX512
run: |
@@ -121,7 +118,7 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
with:
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
path: ${{ env.RESULTS_FILENAME }}
@@ -148,11 +145,11 @@ jobs:
steps:
- name: Notify
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
env:
SLACK_COLOR: ${{ job.status }}
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_MESSAGE: "Integer full benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
SLACK_MESSAGE: "Integer full benchmarks failed. (${{ env.ACTION_RUN_URL }})"
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

View File

@@ -1,11 +1,24 @@
# Run integer benchmarks on an instance with CUDA and return parsed results to Slab CI bot.
# Run integer benchmarks on an AWS instance with CUDA and return parsed results to Slab CI bot.
name: Integer GPU benchmarks
on:
workflow_dispatch:
push:
branches:
- main
inputs:
instance_id:
description: "Instance ID"
type: string
instance_image_id:
description: "Instance AMI ID"
type: string
instance_type:
description: "Instance product type"
type: string
runner_name:
description: "Action runner name"
type: string
request_id:
description: "Slab request ID"
type: string
env:
CARGO_TERM_COLOR: always
@@ -13,36 +26,12 @@ env:
PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
jobs:
setup-instance:
name: Setup instance (cuda-integer-benchmarks)
runs-on: ubuntu-latest
if: github.event_name != 'push' ||
(github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs')
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
backend: hyperstack
profile: single-h100
cuda-integer-benchmarks:
name: Execute GPU integer benchmarks
needs: setup-instance
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
run-integer-benchmarks:
name: Execute integer benchmarks in EC2
runs-on: ${{ github.event.inputs.runner_name }}
if: ${{ !cancelled() }}
strategy:
fail-fast: false
# explicit include-based build matrix, of known valid options
@@ -50,65 +39,52 @@ jobs:
include:
- os: ubuntu-22.04
cuda: "12.2"
gcc: 11
gcc: 9
env:
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
CMAKE_VERSION: 3.29.1
steps:
# Mandatory on hyperstack since a bootable volume is not re-usable yet.
- name: Install dependencies
- name: Instance configuration used
run: |
sudo apt update
sudo apt install -y checkinstall zlib1g-dev libssl-dev
wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
cd cmake-${{ env.CMAKE_VERSION }}
./bootstrap
make -j"$(nproc)"
sudo make install
echo "IDs: ${{ inputs.instance_id }}"
echo "AMI: ${{ inputs.instance_image_id }}"
echo "Type: ${{ inputs.instance_type }}"
echo "Request ID: ${{ inputs.request_id }}"
- name: Get benchmark date
run: |
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
with:
fetch-depth: 0
- name: Get benchmark details
run: |
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
- name: Set up home
# The "Install rust" step requires the root user to have a HOME directory, which is not set.
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
with:
toolchain: nightly
- name: Export CUDA variables
if: ${{ !cancelled() }}
run: |
{
echo "CUDA_PATH=$CUDA_PATH";
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
} >> "${GITHUB_ENV}"
echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
# Specify the correct host compilers
- name: Export gcc and g++ variables
if: ${{ !cancelled() }}
run: |
{
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
} >> "${GITHUB_ENV}"
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Run benchmarks with AVX512
run: |
@@ -120,37 +96,39 @@ jobs:
parse_integer_benches
- name: Upload csv results artifact
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
with:
name: ${{ github.sha }}_csv_integer
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
- name: Parse results
run: |
COMMIT_DATE="$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})"
COMMIT_HASH="$(git describe --tags --dirty)"
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
--database tfhe_rs \
--hardware "n3-H100x1" \
--hardware ${{ inputs.instance_type }} \
--backend gpu \
--project-version "${{ env.COMMIT_HASH }}" \
--project-version "${COMMIT_HASH}" \
--branch ${{ github.ref_name }} \
--commit-date "${{ env.COMMIT_DATE }}" \
--commit-date "${COMMIT_DATE}" \
--bench-date "${{ env.BENCH_DATE }}" \
--walk-subdirs \
--name-suffix avx512 \
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
with:
name: ${{ github.sha }}_integer
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
with:
repository: zama-ai/slab
path: slab
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
- name: Send data to Slab
shell: bash
@@ -166,39 +144,14 @@ jobs:
-d @${{ env.RESULTS_FILENAME }} \
${{ secrets.SLAB_URL }}
slack-notify:
name: Slack Notification
needs: [ setup-instance, cuda-integer-benchmarks ]
runs-on: ubuntu-latest
if: ${{ !success() && !cancelled() }}
continue-on-error: true
steps:
- name: Send message
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ needs.cuda-integer-benchmarks.result }}
SLACK_MESSAGE: "Integer GPU benchmarks finished with status: ${{ needs.cuda-integer-benchmarks.result }}. (${{ env.ACTION_RUN_URL }})"
teardown-instance:
name: Teardown instance (cuda-integer-benchmarks)
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
needs: [ setup-instance, cuda-integer-benchmarks, slack-notify ]
runs-on: ubuntu-latest
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
label: ${{ needs.setup-instance.outputs.runner-name }}
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Instance teardown (cuda-integer-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_MESSAGE: "Integer GPU benchmarks failed. (${{ env.ACTION_RUN_URL }})"
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

View File

@@ -1,88 +1,76 @@
# Run all integer benchmarks on an instance with CUDA and return parsed results to Slab CI bot.
# Run all integer benchmarks on an AWS instance with CUDA and return parsed results to Slab CI bot.
name: Integer GPU full benchmarks
on:
workflow_dispatch:
schedule:
# Weekly benchmarks will be triggered each Saturday at 1 a.m.
- cron: '0 1 * * 6'
inputs:
instance_id:
description: "Instance ID"
type: string
instance_image_id:
description: "Instance AMI ID"
type: string
instance_type:
description: "Instance product type"
type: string
runner_name:
description: "Action runner name"
type: string
request_id:
description: "Slab request ID"
type: string
# This input is not used in this workflow but is still mandatory since a calling workflow could
# use it. If a triggering command includes a user_inputs field, then the triggered workflow
# must include this very input, otherwise the workflow won't be called.
# See start_full_benchmarks.yml as an example.
user_inputs:
description: "Type of benchmarks to run"
type: string
default: "weekly_benchmarks"
env:
CARGO_TERM_COLOR: always
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
jobs:
setup-instance:
name: Setup instance (cuda-integer-full-benchmarks)
runs-on: ubuntu-latest
if: github.event_name != 'schedule' ||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
backend: hyperstack
profile: single-h100
cuda-integer-full-benchmarks:
name: Execute GPU integer benchmarks for all operations flavor
needs: setup-instance
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
timeout-minutes: 1440 # 24 hours
integer-benchmarks:
name: Execute integer benchmarks for all operations flavor
runs-on: ${{ github.event.inputs.runner_name }}
if: ${{ !cancelled() }}
continue-on-error: true
strategy:
fail-fast: false
max-parallel: 1
matrix:
command: [integer, integer_multi_bit]
op_flavor: [default, unchecked]
command: [ integer, integer_multi_bit]
op_flavor: [ default, unchecked ]
# explicit include-based build matrix, of known valid options
include:
- os: ubuntu-22.04
cuda: "12.2"
gcc: 11
gcc: 9
env:
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
CMAKE_VERSION: 3.29.1
steps:
# Mandatory on hyperstack since a bootable volume is not re-usable yet.
- name: Install dependencies
- name: Instance configuration used
run: |
sudo apt update
sudo apt install -y checkinstall zlib1g-dev libssl-dev
wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
cd cmake-${{ env.CMAKE_VERSION }}
./bootstrap
make -j"$(nproc)"
sudo make install
echo "IDs: ${{ inputs.instance_id }}"
echo "AMI: ${{ inputs.instance_image_id }}"
echo "Type: ${{ inputs.instance_type }}"
echo "Request ID: ${{ inputs.request_id }}"
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
with:
fetch-depth: 0
- name: Get benchmark details
run: |
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})" >> "${GITHUB_ENV}"
echo "COMMIT_HASH=$(git describe --tags --dirty)" >> "${GITHUB_ENV}"
- name: Set up home
# The "Install rust" step requires the root user to have a HOME directory, which is not set.
@@ -90,36 +78,33 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
with:
toolchain: nightly
- name: Export CUDA variables
if: ${{ !cancelled() }}
run: |
{
echo "CUDA_PATH=$CUDA_PATH";
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
} >> "${GITHUB_ENV}"
echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
# Specify the correct host compilers
- name: Export gcc and g++ variables
if: ${{ !cancelled() }}
run: |
{
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
} >> "${GITHUB_ENV}"
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Checkout Slab repo
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
with:
repository: zama-ai/slab
path: slab
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
- name: Run benchmarks with AVX512
run: |
@@ -129,7 +114,7 @@ jobs:
run: |
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
--database tfhe_rs \
--hardware "n3-H100x1" \
--hardware ${{ inputs.instance_type }} \
--backend gpu \
--project-version "${{ env.COMMIT_HASH }}" \
--branch ${{ github.ref_name }} \
@@ -140,7 +125,7 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
with:
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
path: ${{ env.RESULTS_FILENAME }}
@@ -159,39 +144,19 @@ jobs:
-d @${{ env.RESULTS_FILENAME }} \
${{ secrets.SLAB_URL }}
slack-notify:
slack-notification:
name: Slack Notification
needs: [ setup-instance, cuda-integer-full-benchmarks ]
runs-on: ubuntu-latest
if: ${{ !success() && !cancelled() }}
continue-on-error: true
runs-on: ${{ github.event.inputs.runner_name }}
if: ${{ failure() }}
needs: integer-benchmarks
steps:
- name: Send message
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ needs.cuda-integer-full-benchmarks.result }}
SLACK_MESSAGE: "Integer GPU full benchmarks finished with status: ${{ needs.cuda-integer-full-benchmarks.result }}. (${{ env.ACTION_RUN_URL }})"
teardown-instance:
name: Teardown instance (cuda-integer-full-benchmarks)
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
needs: [ setup-instance, cuda-integer-full-benchmarks, slack-notify ]
runs-on: ubuntu-latest
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
label: ${{ needs.setup-instance.outputs.runner-name }}
- name: Slack Notification
if: ${{ failure() }}
- name: Notify
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Instance teardown (cuda-integer-full-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_MESSAGE: "Integer GPU full benchmarks failed. (${{ env.ACTION_RUN_URL }})"
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

View File

@@ -26,7 +26,6 @@ env:
PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
jobs:
run-integer-benchmarks:
@@ -46,7 +45,7 @@ jobs:
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
with:
fetch-depth: 0
@@ -56,7 +55,7 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
with:
toolchain: nightly
@@ -70,7 +69,7 @@ jobs:
parse_integer_benches
- name: Upload csv results artifact
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
with:
name: ${{ github.sha }}_csv_integer
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
@@ -91,17 +90,17 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
with:
name: ${{ github.sha }}_integer
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
with:
repository: zama-ai/slab
path: slab
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
- name: Send data to Slab
shell: bash
@@ -120,11 +119,11 @@ jobs:
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
env:
SLACK_COLOR: ${{ job.status }}
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_MESSAGE: "Integer benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
SLACK_MESSAGE: "Integer benchmarks failed. (${{ env.ACTION_RUN_URL }})"
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

View File

@@ -1,11 +1,24 @@
# Run integer benchmarks with multi-bit cryptographic parameters on an instance and return parsed results to Slab CI bot.
name: Integer GPU Multi-bit benchmarks
# Run integer benchmarks with multi-bit cryptographic parameters on an AWS instance and return parsed results to Slab CI bot.
name: Integer Multi-bit benchmarks
on:
workflow_dispatch:
schedule:
# Weekly benchmarks will be triggered each Saturday at 1 a.m.
- cron: '0 1 * * 6'
inputs:
instance_id:
description: "Instance ID"
type: string
instance_image_id:
description: "Instance AMI ID"
type: string
instance_type:
description: "Instance product type"
type: string
runner_name:
description: "Action runner name"
type: string
request_id:
description: "Slab request ID"
type: string
env:
CARGO_TERM_COLOR: always
@@ -13,103 +26,66 @@ env:
PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
jobs:
setup-instance:
name: Setup instance (cuda-integer-multi-bit-benchmarks)
runs-on: ubuntu-latest
if: github.event_name != 'schedule' ||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
backend: hyperstack
profile: single-h100
cuda-integer-multi-bit-benchmarks:
name: Execute GPU integer multi-bit benchmarks
needs: setup-instance
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
timeout-minutes: 1440 # 24 hours
run-integer-benchmarks:
name: Execute integer multi-bit benchmarks in EC2
runs-on: ${{ github.event.inputs.runner_name }}
if: ${{ !cancelled() }}
strategy:
fail-fast: false
# explicit include-based build matrix, of known valid options
matrix:
include:
- os: ubuntu-22.04
cuda: "12.2"
gcc: 11
cuda: "11.8"
cuda_arch: "70"
gcc: 9
env:
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
CMAKE_VERSION: 3.29.1
steps:
# Mandatory on hyperstack since a bootable volume is not re-usable yet.
- name: Install dependencies
- name: Instance configuration used
run: |
sudo apt update
sudo apt install -y checkinstall zlib1g-dev libssl-dev
wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
cd cmake-${{ env.CMAKE_VERSION }}
./bootstrap
make -j"$(nproc)"
sudo make install
echo "IDs: ${{ inputs.instance_id }}"
echo "AMI: ${{ inputs.instance_image_id }}"
echo "Type: ${{ inputs.instance_type }}"
echo "Request ID: ${{ inputs.request_id }}"
- name: Get benchmark date
run: |
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
with:
fetch-depth: 0
- name: Get benchmark details
run: |
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
- name: Set up home
# The "Install rust" step requires the root user to have a HOME directory, which is not set.
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
with:
toolchain: nightly
- name: Export CUDA variables
if: ${{ !cancelled() }}
run: |
{
echo "CUDA_PATH=$CUDA_PATH";
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
} >> "${GITHUB_ENV}"
echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
# Specify the correct host compilers
- name: Export gcc and g++ variables
if: ${{ !cancelled() }}
run: |
{
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
} >> "${GITHUB_ENV}"
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Run multi-bit benchmarks with AVX512
run: |
@@ -121,37 +97,39 @@ jobs:
parse_integer_benches
- name: Upload csv results artifact
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
with:
name: ${{ github.sha }}_csv_integer
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
- name: Parse results
run: |
COMMIT_DATE="$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})"
COMMIT_HASH="$(git describe --tags --dirty)"
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
--database tfhe_rs \
--hardware "n3-H100x1" \
--hardware ${{ inputs.instance_type }} \
--backend gpu \
--project-version "${{ env.COMMIT_HASH }}" \
--project-version "${COMMIT_HASH}" \
--branch ${{ github.ref_name }} \
--commit-date "${{ env.COMMIT_DATE }}" \
--commit-date "${COMMIT_DATE}" \
--bench-date "${{ env.BENCH_DATE }}" \
--walk-subdirs \
--name-suffix avx512 \
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
with:
name: ${{ github.sha }}_integer
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
with:
repository: zama-ai/slab
path: slab
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
- name: Send data to Slab
shell: bash
@@ -167,40 +145,14 @@ jobs:
-d @${{ env.RESULTS_FILENAME }} \
${{ secrets.SLAB_URL }}
slack-notify:
name: Slack Notification
needs: [ setup-instance, cuda-integer-multi-bit-benchmarks ]
runs-on: ubuntu-latest
if: ${{ !success() && !cancelled() }}
continue-on-error: true
steps:
- name: Send message
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ needs.cuda-integer-multi-bit-benchmarks.result }}
SLACK_MESSAGE: "Integer GPU multi-bit benchmarks finished with status: ${{ needs.cuda-integer-multi-bit-benchmarks.result }}. (${{ env.ACTION_RUN_URL }})"
teardown-instance:
name: Teardown instance (cuda-integer-full-benchmarks)
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
needs: [ setup-instance, cuda-integer-multi-bit-benchmarks, slack-notify ]
runs-on: ubuntu-latest
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
label: ${{ needs.setup-instance.outputs.runner-name }}
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Instance teardown (cuda-integer-multi-bit-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_MESSAGE: "Integer GPU benchmarks failed. (${{ env.ACTION_RUN_URL }})"
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

View File

@@ -1,181 +0,0 @@
# Run 64-bit multi-bit integer benchmarks on an instance with CUDA and return parsed results to Slab CI bot.
name: Integer multi GPU Multi-bit benchmarks
on:
workflow_dispatch:
schedule:
# Weekly benchmarks will be triggered each Saturday at 1a.m.
- cron: '0 1 * * 6'
env:
CARGO_TERM_COLOR: always
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
jobs:
setup-instance:
name: Setup instance (cuda-integer-multi-bit-multi-gpu-benchmarks)
runs-on: ubuntu-latest
if: ${{ (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs') || github.event_name == 'workflow_dispatch' }}
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
backend: aws
profile: multi-gpu-test
cuda-integer-multi-bit-multi-gpu-benchmarks:
name: Execute multi GPU integer multi-bit benchmarks
needs: setup-instance
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
timeout-minutes: 1440 # 24 hours
continue-on-error: true
strategy:
fail-fast: false
max-parallel: 1
matrix:
include:
- os: ubuntu-22.04
cuda: "12.2"
gcc: 9
env:
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
steps:
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
fetch-depth: 0
- name: Get benchmark details
run: |
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
- name: Set up home
# The "Install rust" step requires the root user to have a HOME directory, which is not set.
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
with:
toolchain: nightly
- name: Export CUDA variables
if: ${{ !cancelled() }}
run: |
{
echo "CUDA_PATH=$CUDA_PATH";
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
} >> "${GITHUB_ENV}"
echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
# Specify the correct host compilers
- name: Export gcc and g++ variables
if: ${{ !cancelled() }}
run: |
{
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
} >> "${GITHUB_ENV}"
- name: Checkout Slab repo
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
repository: zama-ai/slab
path: slab
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
- name: Run multi-bit benchmarks with AVX512
run: |
make FAST_BENCH=TRUE BENCH_OP_FLAVOR=default bench_integer_multi_bit_gpu
- name: Parse results
run: |
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
--database tfhe_rs \
--hardware "p3.8xlarge" \
--backend gpu \
--project-version "${{ env.COMMIT_HASH }}" \
--branch ${{ github.ref_name }} \
--commit-date "${{ env.COMMIT_DATE }}" \
--bench-date "${{ env.BENCH_DATE }}" \
--walk-subdirs \
--name-suffix avx512 \
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
with:
name: ${{ github.sha }}_integer
path: ${{ env.RESULTS_FILENAME }}
- name: Send data to Slab
shell: bash
run: |
echo "Computing HMac on results file"
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
echo "Sending results to Slab..."
curl -v -k \
-H "Content-Type: application/json" \
-H "X-Slab-Repository: ${{ github.repository }}" \
-H "X-Slab-Command: store_data_v2" \
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-d @${{ env.RESULTS_FILENAME }} \
${{ secrets.SLAB_URL }}
slack-notify:
name: Slack Notification
needs: [ setup-instance, cuda-integer-multi-bit-multi-gpu-benchmarks ]
runs-on: ubuntu-latest
if: ${{ !success() && !cancelled() }}
continue-on-error: true
steps:
- name: Send message
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ needs.cuda-integer-multi-bit-multi-gpu-benchmarks.result }}
SLACK_MESSAGE: "Integer multi GPU multi-bit benchmarks finished with status: ${{ needs.cuda-integer-multi-bit-multi-gpu-benchmarks.result }}. (${{ env.ACTION_RUN_URL }})"
teardown-instance:
name: Teardown instance (cuda-integer-multi-bit-multi-gpu-benchmarks)
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
needs: [ setup-instance, cuda-integer-multi-bit-multi-gpu-benchmarks ]
runs-on: ubuntu-latest
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
label: ${{ needs.setup-instance.outputs.runner-name }}
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Instance teardown (cuda-integer-multi-bit-multi-gpu-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"

View File

@@ -1,185 +0,0 @@
# Run all integer benchmarks on an instance with CUDA and return parsed results to Slab CI bot.
name: Integer multi GPU full benchmarks
on:
workflow_dispatch:
schedule:
# Weekly benchmarks will be triggered each Saturday at 1a.m.
- cron: '0 1 * * 6'
env:
CARGO_TERM_COLOR: always
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
jobs:
setup-instance:
name: Setup instance (cuda-integer-full-multi-gpu-benchmarks)
runs-on: ubuntu-latest
if: github.event_name != 'schedule' ||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
backend: aws
profile: multi-gpu-test
cuda-integer-full-multi-gpu-benchmarks:
name: Execute multi GPU integer benchmarks for all operations flavor
needs: setup-instance
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
timeout-minutes: 1440 # 24 hours
continue-on-error: true
strategy:
fail-fast: false
max-parallel: 1
matrix:
command: [integer, integer_multi_bit]
op_flavor: [default, unchecked]
# explicit include-based build matrix, of known valid options
include:
- os: ubuntu-22.04
cuda: "12.2"
gcc: 9
env:
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
steps:
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
fetch-depth: 0
- name: Get benchmark details
run: |
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
- name: Set up home
# The "Install rust" step requires the root user to have a HOME directory, which is not set.
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
with:
toolchain: nightly
- name: Export CUDA variables
if: ${{ !cancelled() }}
run: |
{
echo "CUDA_PATH=$CUDA_PATH";
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
} >> "${GITHUB_ENV}"
echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
# Specify the correct host compilers
- name: Export gcc and g++ variables
if: ${{ !cancelled() }}
run: |
{
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
} >> "${GITHUB_ENV}"
- name: Checkout Slab repo
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
repository: zama-ai/slab
path: slab
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
- name: Run benchmarks with AVX512
run: |
make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_${{ matrix.command }}_gpu
- name: Parse results
run: |
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
--database tfhe_rs \
--hardware "p3.8xlarge" \
--backend gpu \
--project-version "${{ env.COMMIT_HASH }}" \
--branch ${{ github.ref_name }} \
--commit-date "${{ env.COMMIT_DATE }}" \
--bench-date "${{ env.BENCH_DATE }}" \
--walk-subdirs \
--name-suffix avx512 \
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
with:
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
path: ${{ env.RESULTS_FILENAME }}
- name: Send data to Slab
shell: bash
run: |
echo "Computing HMac on results file"
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
echo "Sending results to Slab..."
curl -v -k \
-H "Content-Type: application/json" \
-H "X-Slab-Repository: ${{ github.repository }}" \
-H "X-Slab-Command: store_data_v2" \
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-d @${{ env.RESULTS_FILENAME }} \
${{ secrets.SLAB_URL }}
slack-notify:
name: Slack Notification
needs: [ setup-instance, cuda-integer-full-multi-gpu-benchmarks ]
runs-on: ubuntu-latest
if: ${{ !success() && !cancelled() }}
continue-on-error: true
steps:
- name: Send message
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ needs.cuda-integer-full-multi-gpu-benchmarks.result }}
SLACK_MESSAGE: "Integer GPU full benchmarks finished with status: ${{ needs.cuda-integer-full-multi-gpu-benchmarks.result }}. (${{ env.ACTION_RUN_URL }})"
teardown-instance:
name: Teardown instance (cuda-integer-full-multi-gpu-benchmarks)
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
needs: [ setup-instance, cuda-integer-full-multi-gpu-benchmarks ]
runs-on: ubuntu-latest
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
label: ${{ needs.setup-instance.outputs.runner-name }}
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Instance teardown (cuda-integer-full-multi-gpu-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"

View File

@@ -3,7 +3,7 @@ name: Tests on M1 CPU
on:
workflow_dispatch:
pull_request:
types: [ labeled ]
types: [labeled]
# Have a nightly build for M1 tests
schedule:
# * is a special character in YAML so you have to quote this string
@@ -15,12 +15,8 @@ env:
CARGO_TERM_COLOR: always
RUSTFLAGS: "-C target-cpu=native"
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
FAST_TESTS: "TRUE"
# We clear the cache to reduce memory pressure because of the numerous processes of cargo
# nextest
TFHE_RS_CLEAR_IN_MEMORY_KEY_CACHE: "1"
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref }}
@@ -30,16 +26,12 @@ jobs:
cargo-builds:
if: ${{ (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') || github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'm1_test') }}
runs-on: ["self-hosted", "m1mac"]
# 12 hours, default is 6 hours, hopefully this is more than enough
timeout-minutes: 720
steps:
- uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
persist-credentials: 'false'
- uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
- name: Install latest stable
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
with:
toolchain: stable
@@ -79,10 +71,6 @@ jobs:
run: |
make test_concrete_csprng
- name: Run tfhe-zk-pok tests
run: |
make test_zk_pok
- name: Run core tests
run: |
make test_core_crypto
@@ -91,13 +79,6 @@ jobs:
run: |
make test_boolean
# Because we do "illegal" things with the build system which Cargo does not seem to like much
# we need to clear the cache to make sure the C API is built properly and does not use a stale
# cached version
- name: Clear build cache
run: |
cargo clean
- name: Run C API tests
run: |
make test_c_api
@@ -149,7 +130,7 @@ jobs:
- name: Slack Notification
if: ${{ needs.cargo-builds.result != 'skipped' }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
env:
SLACK_COLOR: ${{ needs.cargo-builds.result }}
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}

View File

@@ -20,14 +20,9 @@ on:
description: "Push node js package"
type: boolean
default: true
npm_latest_tag:
description: "Set NPM tag as latest"
type: boolean
default: false
env:
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
NPM_TAG: ""
jobs:
publish_release:
@@ -35,14 +30,10 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
with:
fetch-depth: 0
- name: Create NPM version tag
run: |
echo "NPM_TAG=$(sed -n -e '1,/^version/p' tfhe/Cargo.toml | grep '^version[[:space:]]*=' | cut -d '=' -f 2 | tr -d ' ')" >> "${GITHUB_ENV}"
- name: Publish crate.io package
if: ${{ inputs.push_to_crates }}
env:
@@ -54,20 +45,11 @@ jobs:
- name: Build web package
if: ${{ inputs.push_web_package }}
run: |
make build_web_js_api_parallel
make build_web_js_api
- name: Publish web package
if: ${{ inputs.push_web_package }}
uses: JS-DevTools/npm-publish@19c28f1ef146469e409470805ea4279d47c3d35c
with:
token: ${{ secrets.NPM_TOKEN }}
package: tfhe/pkg/package.json
dry-run: ${{ inputs.dry_run }}
tag: ${{ env.NPM_TAG }}
- name: Publish web package as latest
if: ${{ inputs.push_web_package && inputs.npm_latest_tag }}
uses: JS-DevTools/npm-publish@19c28f1ef146469e409470805ea4279d47c3d35c
uses: JS-DevTools/npm-publish@4b07b26a2f6e0a51846e1870223e545bae91c552
with:
token: ${{ secrets.NPM_TOKEN }}
package: tfhe/pkg/package.json
@@ -83,16 +65,7 @@ jobs:
- name: Publish Node package
if: ${{ inputs.push_node_package }}
uses: JS-DevTools/npm-publish@19c28f1ef146469e409470805ea4279d47c3d35c
with:
token: ${{ secrets.NPM_TOKEN }}
package: tfhe/pkg/package.json
dry-run: ${{ inputs.dry_run }}
tag: ${{ env.NPM_TAG }}
- name: Publish Node package as latest
if: ${{ inputs.push_node_package && inputs.npm_latest_tag }}
uses: JS-DevTools/npm-publish@19c28f1ef146469e409470805ea4279d47c3d35c
uses: JS-DevTools/npm-publish@4b07b26a2f6e0a51846e1870223e545bae91c552
with:
token: ${{ secrets.NPM_TOKEN }}
package: tfhe/pkg/package.json
@@ -101,7 +74,7 @@ jobs:
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
env:
SLACK_COLOR: ${{ job.status }}
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}

View File

@@ -18,7 +18,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
with:
fetch-depth: 0
@@ -32,7 +32,7 @@ jobs:
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
env:
SLACK_COLOR: ${{ job.status }}
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}

View File

@@ -1,129 +0,0 @@
# Publish new release of tfhe-cuda-backend on crates.io.
name: Publish CUDA release
on:
workflow_dispatch:
inputs:
dry_run:
description: "Dry-run"
type: boolean
default: true
push_to_crates:
description: "Push to crate"
type: boolean
default: true
env:
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
jobs:
setup-instance:
name: Setup instance (publish-cuda-release)
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
backend: aws
profile: gpu-test
publish-cuda-release:
name: Publish CUDA Release
needs: setup-instance
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
strategy:
fail-fast: false
# explicit include-based build matrix, of known valid options
matrix:
include:
- os: ubuntu-22.04
cuda: "12.2"
gcc: 9
env:
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
steps:
- name: Checkout
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
fetch-depth: 0
- name: Set up home
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
with:
toolchain: stable
- name: Export CUDA variables
if: ${{ !cancelled() }}
run: |
echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
{
echo "CUDA_PATH=$CUDA_PATH";
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
} >> "${GITHUB_ENV}"
# Specify the correct host compilers
- name: Export gcc and g++ variables
if: ${{ !cancelled() }}
run: |
{
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
echo "HOME=/home/ubuntu";
} >> "${GITHUB_ENV}"
- name: Publish crate.io package
if: ${{ inputs.push_to_crates }}
env:
CRATES_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
DRY_RUN: ${{ inputs.dry_run && '--dry-run' || '' }}
run: |
cargo publish -p tfhe-cuda-backend --token ${{ env.CRATES_TOKEN }} ${{ env.DRY_RUN }}
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "tfhe-cuda-backend release finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
teardown-instance:
name: Teardown instance (publish-release)
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
needs: [ setup-instance, publish-cuda-release ]
runs-on: ubuntu-latest
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
label: ${{ needs.setup-instance.outputs.runner-name }}
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Instance teardown (publish-cuda-release) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"

View File

@@ -1,42 +0,0 @@
# Publish new release of tfhe-zk-pok on crates.io.
name: Publish tfhe-zk-pok release
on:
workflow_dispatch:
inputs:
dry_run:
description: "Dry-run"
type: boolean
default: true
env:
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
jobs:
publish_release:
name: Publish tfhe-zk-pok Release
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
fetch-depth: 0
- name: Publish crate.io package
env:
CRATES_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
DRY_RUN: ${{ inputs.dry_run && '--dry-run' || '' }}
run: |
cargo publish -p tfhe-zk-pok --token ${{ env.CRATES_TOKEN }} ${{ env.DRY_RUN }}
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ job.status }}
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_MESSAGE: "tfhe-zk-pok release failed: (${{ env.ACTION_RUN_URL }})"
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

View File

@@ -17,14 +17,13 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
- name: Checkout lattice-estimator
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
with:
repository: malb/lattice-estimator
path: lattice_estimator
ref: '53508253629d3b5d31a2ad110e85dc69391ccb95'
- name: Install Sage
run: |
@@ -42,7 +41,7 @@ jobs:
- name: Slack Notification
if: ${{ always() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
env:
SLACK_COLOR: ${{ job.status }}
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}

View File

@@ -25,7 +25,6 @@ env:
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
jobs:
run-shortint-benchmarks:
@@ -45,7 +44,7 @@ jobs:
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
with:
fetch-depth: 0
@@ -55,7 +54,7 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
with:
toolchain: nightly
@@ -89,17 +88,17 @@ jobs:
--append-results
- name: Upload parsed results artifact
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
with:
name: ${{ github.sha }}_shortint
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
with:
repository: zama-ai/slab
path: slab
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
- name: Send data to Slab
shell: bash
@@ -118,11 +117,11 @@ jobs:
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
env:
SLACK_COLOR: ${{ job.status }}
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_MESSAGE: "Shortint benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
SLACK_MESSAGE: "Shortint benchmarks failed. (${{ env.ACTION_RUN_URL }})"
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

View File

@@ -33,7 +33,6 @@ env:
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
jobs:
shortint-benchmarks:
@@ -53,17 +52,15 @@ jobs:
echo "Request ID: ${{ inputs.request_id }}"
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
with:
fetch-depth: 0
- name: Get benchmark details
run: |
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})" >> "${GITHUB_ENV}"
echo "COMMIT_HASH=$(git describe --tags --dirty)" >> "${GITHUB_ENV}"
- name: Set up home
# "Install rust" step require root user to have a HOME directory which is not set.
@@ -71,16 +68,16 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
with:
toolchain: nightly
- name: Checkout Slab repo
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
with:
repository: zama-ai/slab
path: slab
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
- name: Run benchmarks with AVX512
run: |
@@ -115,7 +112,7 @@ jobs:
--append-results
- name: Upload parsed results artifact
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
with:
name: ${{ github.sha }}_shortint_${{ matrix.op_flavor }}
path: ${{ env.RESULTS_FILENAME }}
@@ -142,11 +139,11 @@ jobs:
steps:
- name: Notify
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
env:
SLACK_COLOR: ${{ job.status }}
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_MESSAGE: "Shortint full benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
SLACK_MESSAGE: "Shortint full benchmarks failed. (${{ env.ACTION_RUN_URL }})"
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

View File

@@ -26,7 +26,6 @@ env:
PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
jobs:
run-integer-benchmarks:
@@ -46,7 +45,7 @@ jobs:
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
with:
fetch-depth: 0
@@ -56,7 +55,7 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
with:
toolchain: nightly
@@ -70,7 +69,7 @@ jobs:
parse_integer_benches
- name: Upload csv results artifact
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
with:
name: ${{ github.sha }}_csv_integer
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
@@ -91,17 +90,17 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
with:
name: ${{ github.sha }}_integer
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
with:
repository: zama-ai/slab
path: slab
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
- name: Send data to Slab
shell: bash
@@ -120,11 +119,11 @@ jobs:
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
env:
SLACK_COLOR: ${{ job.status }}
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_MESSAGE: "Signed integer benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
SLACK_MESSAGE: "Signed integer benchmarks failed. (${{ env.ACTION_RUN_URL }})"
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

View File

@@ -29,7 +29,6 @@ env:
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
jobs:
integer-benchmarks:
@@ -52,17 +51,15 @@ jobs:
echo "Request ID: ${{ inputs.request_id }}"
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
with:
fetch-depth: 0
- name: Get benchmark details
run: |
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})" >> "${GITHUB_ENV}"
echo "COMMIT_HASH=$(git describe --tags --dirty)" >> "${GITHUB_ENV}"
- name: Set up home
# "Install rust" step require root user to have a HOME directory which is not set.
@@ -70,16 +67,16 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
with:
toolchain: nightly
- name: Checkout Slab repo
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
with:
repository: zama-ai/slab
path: slab
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
- name: Run benchmarks with AVX512
run: |
@@ -99,7 +96,7 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
with:
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
path: ${{ env.RESULTS_FILENAME }}
@@ -126,11 +123,11 @@ jobs:
steps:
- name: Notify
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
env:
SLACK_COLOR: ${{ job.status }}
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_MESSAGE: "Signed integer full benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
SLACK_MESSAGE: "Signed integer full benchmarks failed. (${{ env.ACTION_RUN_URL }})"
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

View File

@@ -26,7 +26,6 @@ env:
PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
jobs:
run-integer-benchmarks:
@@ -46,7 +45,7 @@ jobs:
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
with:
fetch-depth: 0
@@ -56,7 +55,7 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
with:
toolchain: nightly
@@ -70,7 +69,7 @@ jobs:
parse_integer_benches
- name: Upload csv results artifact
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
with:
name: ${{ github.sha }}_csv_integer
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
@@ -91,17 +90,17 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
with:
name: ${{ github.sha }}_integer
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
with:
repository: zama-ai/slab
path: slab
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
- name: Send data to Slab
shell: bash
@@ -120,11 +119,11 @@ jobs:
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
env:
SLACK_COLOR: ${{ job.status }}
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_MESSAGE: "Signed integer benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
SLACK_MESSAGE: "Signed integer benchmarks failed. (${{ env.ACTION_RUN_URL }})"
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

View File

@@ -36,6 +36,14 @@ on:
description: "Run core crypto benches"
type: boolean
default: true
core_crypto_gpu_bench:
description: "Run core crypto benches on GPU"
type: boolean
default: true
wasm_client_bench:
description: "Run WASM client benches"
type: boolean
default: true
jobs:
start-benchmarks:
@@ -45,17 +53,18 @@ jobs:
command: [ boolean_bench, shortint_bench,
integer_bench, integer_multi_bit_bench,
signed_integer_bench, signed_integer_multi_bit_bench,
core_crypto_bench ]
integer_gpu_bench, integer_multi_bit_gpu_bench,
core_crypto_bench, core_crypto_gpu_bench, wasm_client_bench ]
runs-on: ubuntu-latest
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
with:
fetch-depth: 0
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@d6babd6899969df1a11d14c368283ea4436bca78
uses: tj-actions/changed-files@ec75ae5ab7296b81fd4cddb77294d6718932ebab
with:
files_yaml: |
common_benches:
@@ -97,13 +106,16 @@ jobs:
- tfhe/src/core_crypto/**
- tfhe/benches/core_crypto/**
- .github/workflows/core_crypto_benchmark.yml
wasm_client_bench:
- tfhe/web_wasm_parallel_tests/**
- .github/workflows/wasm_client_benchmark.yml
- name: Checkout Slab repo
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
with:
repository: zama-ai/slab
path: slab
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
- name: Start AWS job in Slab
# If manually triggered check that the current bench has been requested

View File

@@ -25,21 +25,21 @@ jobs:
strategy:
matrix:
command: [ boolean_bench, shortint_full_bench,
integer_full_bench, signed_integer_full_bench,
core_crypto_bench, wasm_client_bench ]
integer_full_bench, signed_integer_full_bench, integer_gpu_full_bench,
core_crypto_bench, core_crypto_gpu_bench, wasm_client_bench ]
runs-on: ubuntu-latest
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
with:
fetch-depth: 0
- name: Checkout Slab repo
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
with:
repository: zama-ai/slab
path: slab
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
- name: Set benchmarks type as weekly
if: (github.event_name == 'workflow_dispatch' && inputs.benchmark_type == 'weekly') || github.event.schedule == '0 1 * * 6'

View File

@@ -13,20 +13,25 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout repo
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
with:
fetch-depth: 0
- name: Save repo
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
with:
name: repo-archive
path: '.'
- name: git-sync
uses: wei/git-sync@55c6b63b4f21607da0e9877ca9b4d11a29fc6d83
with:
source_repo: "zama-ai/tfhe-rs"
source_branch: "main"
destination_repo: "https://${{ secrets.BOT_USERNAME }}:${{ secrets.FHE_ACTIONS_TOKEN }}@github.com/${{ secrets.SYNC_DEST_REPO }}"
destination_repo: "https://${{ secrets.BOT_USERNAME }}:${{ secrets.CONCRETE_ACTIONS_TOKEN }}@github.com/${{ secrets.SYNC_DEST_REPO }}"
destination_branch: "main"
- name: git-sync tags
uses: wei/git-sync@55c6b63b4f21607da0e9877ca9b4d11a29fc6d83
with:
source_repo: "zama-ai/tfhe-rs"
source_branch: "refs/tags/*"
destination_repo: "https://${{ secrets.BOT_USERNAME }}:${{ secrets.FHE_ACTIONS_TOKEN }}@github.com/${{ secrets.SYNC_DEST_REPO }}"
destination_repo: "https://${{ secrets.BOT_USERNAME }}:${{ secrets.CONCRETE_ACTIONS_TOKEN }}@github.com/${{ secrets.SYNC_DEST_REPO }}"
destination_branch: "refs/tags/*"

View File

@@ -1,95 +1,60 @@
# Run WASM client benchmarks on an instance and return parsed results to Slab CI bot.
# Run WASM client benchmarks on an AWS instance and return parsed results to Slab CI bot.
name: WASM client benchmarks
on:
workflow_dispatch:
push:
branches:
- main
schedule:
# Weekly benchmarks will be triggered each Saturday at 1a.m.
- cron: '0 1 * * 6'
inputs:
instance_id:
description: "Instance ID"
type: string
instance_image_id:
description: "Instance AMI ID"
type: string
instance_type:
description: "Instance product type"
type: string
runner_name:
description: "Action runner name"
type: string
request_id:
description: "Slab request ID"
type: string
# This input is not used in this workflow but still mandatory since a calling workflow could
# use it. If a triggering command include a user_inputs field, then the triggered workflow
# must include this very input, otherwise the workflow won't be called.
# See start_full_benchmarks.yml as example.
user_inputs:
description: "Type of benchmarks to run"
type: string
default: "weekly_benchmarks"
env:
CARGO_TERM_COLOR: always
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
jobs:
should-run:
runs-on: ubuntu-latest
if: github.event_name == 'workflow_dispatch' ||
((github.event_name == 'push' || github.event_name == 'schedule') && github.repository == 'zama-ai/tfhe-rs')
permissions:
pull-requests: write
outputs:
wasm_bench: ${{ steps.changed-files.outputs.wasm_bench_any_changed }}
run-wasm-client-benchmarks:
name: Execute WASM client benchmarks in EC2
runs-on: ${{ github.event.inputs.runner_name }}
if: ${{ !cancelled() }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
fetch-depth: 0
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@d6babd6899969df1a11d14c368283ea4436bca78
with:
since_last_remote_commit: true
files_yaml: |
wasm_bench:
- tfhe/Cargo.toml
- concrete-csprng/**
- tfhe-zk-pok/**
- tfhe/src/**
- '!tfhe/src/c_api/**'
- tfhe/web_wasm_parallel_tests/**
- .github/workflows/wasm_client_benchmark.yml
setup-instance:
name: Setup instance (wasm-client-benchmarks)
if: github.event_name != 'push' ||
(github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs' && needs.should-run.outputs.wasm_bench)
needs: should-run
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
backend: aws
profile: cpu-small
wasm-client-benchmarks:
name: Execute WASM client benchmarks
needs: [ should-run, setup-instance ]
if: github.event_name != 'push' ||
(github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs' && needs.should-run.outputs.wasm_bench)
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
fetch-depth: 0
- name: Get benchmark details
- name: Instance configuration used
run: |
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
echo "IDs: ${{ inputs.instance_id }}"
echo "AMI: ${{ inputs.instance_image_id }}"
echo "Type: ${{ inputs.instance_type }}"
echo "Request ID: ${{ inputs.request_id }}"
- name: Get benchmark date
run: |
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
with:
fetch-depth: 0
- name: Set up home
# "Install rust" step require root user to have a HOME directory which is not set.
@@ -97,7 +62,7 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
with:
toolchain: nightly
@@ -109,12 +74,15 @@ jobs:
- name: Parse results
run: |
make parse_wasm_benchmarks
COMMIT_DATE="$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})"
COMMIT_HASH="$(git describe --tags --dirty)"
python3 ./ci/benchmark_parser.py tfhe/wasm_pk_gen.csv ${{ env.RESULTS_FILENAME }} \
--database tfhe_rs \
--hardware "m6i.4xlarge" \
--project-version "${{ env.COMMIT_HASH }}" \
--hardware ${{ inputs.instance_type }} \
--project-version "${COMMIT_HASH}" \
--branch ${{ github.ref_name }} \
--commit-date "${{ env.COMMIT_DATE }}" \
--commit-date "${COMMIT_DATE}" \
--bench-date "${{ env.BENCH_DATE }}" \
--key-gen
@@ -129,17 +97,17 @@ jobs:
--append-results
- name: Upload parsed results artifact
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
with:
name: ${{ github.sha }}_wasm
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
with:
repository: zama-ai/slab
path: slab
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
- name: Send data to Slab
shell: bash
@@ -158,31 +126,11 @@ jobs:
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "WASM benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
teardown-instance:
name: Teardown instance (wasm-client-benchmarks)
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
needs: [ setup-instance, wasm-client-benchmarks ]
runs-on: ubuntu-latest
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
label: ${{ needs.setup-instance.outputs.runner-name }}
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Instance teardown (wasm-client-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_MESSAGE: "WASM benchmarks failed. (${{ env.ACTION_RUN_URL }})"
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

3
.gitignore vendored
View File

@@ -19,6 +19,3 @@ dieharder_run.log
# Coverage reports
/coverage/
# Cuda local build
backends/tfhe-cuda-backend/cuda/cmake-build-debug/

View File

@@ -1,13 +1,6 @@
[workspace]
resolver = "2"
members = [
"tfhe",
"tfhe-zk-pok",
"tasks",
"apps/trivium",
"concrete-csprng",
"backends/tfhe-cuda-backend",
]
members = ["tfhe", "tasks", "apps/trivium", "concrete-csprng", "backends/tfhe-cuda-backend"]
[profile.bench]
lto = "fat"
@@ -24,4 +17,3 @@ lto = "off"
inherits = "dev"
opt-level = 3
lto = "off"
debug-assertions = false

226
Makefile
View File

@@ -3,7 +3,6 @@ OS:=$(shell uname)
RS_CHECK_TOOLCHAIN:=$(shell cat toolchain.txt | tr -d '\n')
CARGO_RS_CHECK_TOOLCHAIN:=+$(RS_CHECK_TOOLCHAIN)
TARGET_ARCH_FEATURE:=$(shell ./scripts/get_arch_feature.sh)
CPU_COUNT=$(shell ./scripts/cpu_count.sh)
RS_BUILD_TOOLCHAIN:=stable
CARGO_RS_BUILD_TOOLCHAIN:=+$(RS_BUILD_TOOLCHAIN)
CARGO_PROFILE?=release
@@ -62,7 +61,7 @@ REGEX_STRING?=''
REGEX_PATTERN?=''
# tfhe-cuda-backend
TFHECUDA_SRC=backends/tfhe-cuda-backend/cuda
TFHECUDA_SRC="backends/tfhe-cuda-backend/cuda"
TFHECUDA_BUILD=$(TFHECUDA_SRC)/build
# Exclude these files from coverage reports
@@ -120,12 +119,7 @@ install_wasm_pack: install_rs_build_toolchain
.PHONY: install_node # Install last version of NodeJS via nvm
install_node:
curl -o nvm_install.sh https://raw.githubusercontent.com/nvm-sh/nvm/v0.39.3/install.sh
@echo "2ed5e94ba12434370f0358800deb69f514e8bce90f13beb0e1b241d42c6abafd nvm_install.sh" > nvm_checksum
@sha256sum -c nvm_checksum
@rm nvm_checksum
$(SHELL) nvm_install.sh
@rm nvm_install.sh
curl -o- https://raw.githubusercontent.com/nvm-sh/nvm/v0.39.3/install.sh | $(SHELL)
source ~/.bashrc
$(SHELL) -i -c 'nvm install $(NODE_VERSION)' || \
( echo "Unable to install node, unknown error." && exit 1 )
@@ -150,67 +144,28 @@ check_linelint_installed:
@printf "\n" | linelint - > /dev/null 2>&1 || \
( echo "Unable to locate linelint. Try installing it: https://github.com/fernandrone/linelint/releases" && exit 1 )
.PHONY: check_actionlint_installed # Check if actionlint workflow linter is installed
check_actionlint_installed:
@actionlint --version > /dev/null 2>&1 || \
( echo "Unable to locate actionlint. Try installing it: https://github.com/rhysd/actionlint/releases" && exit 1 )
.PHONY: check_nvm_installed # Check if Node Version Manager is installed
check_nvm_installed:
@source ~/.nvm/nvm.sh && nvm --version > /dev/null 2>&1 || \
( echo "Unable to locate Node. Run 'make install_node'" && exit 1 )
.PHONY: install_mlc # Install mlc (Markup Link Checker)
install_mlc: install_rs_build_toolchain
@mlc --version > /dev/null 2>&1 || \
cargo $(CARGO_RS_BUILD_TOOLCHAIN) install mlc --locked || \
( echo "Unable to install mlc, unknown error." && exit 1 )
.PHONY: fmt # Format rust code
fmt: install_rs_check_toolchain
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" fmt
.PHONY: fmt_js # Format javascript code
fmt_js: check_nvm_installed
source ~/.nvm/nvm.sh && \
nvm install $(NODE_VERSION) && \
nvm use $(NODE_VERSION) && \
$(MAKE) -C tfhe/web_wasm_parallel_tests fmt
.PHONY: fmt_gpu # Format rust and cuda code
fmt_gpu: install_rs_check_toolchain
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" fmt
cd "$(TFHECUDA_SRC)" && ./format_tfhe_cuda_backend.sh
.PHONY: fmt_c_tests # Format c tests
fmt_c_tests:
find tfhe/c_api_tests/ -regex '.*\.\(cpp\|hpp\|cu\|c\|h\)' -exec clang-format -style=file -i {} \;
.PHONY: check_fmt # Check rust code format
check_fmt: install_rs_check_toolchain
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" fmt --check
.PHONY: check_fmt_c_tests # Check C tests format
check_fmt_c_tests:
find tfhe/c_api_tests/ -regex '.*\.\(cpp\|hpp\|cu\|c\|h\)' -exec clang-format --dry-run --Werror -style=file {} \;
.PHONY: check_fmt_gpu # Check rust and cuda code format
check_fmt_gpu: install_rs_check_toolchain
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" fmt --check
cd "$(TFHECUDA_SRC)" && ./format_tfhe_cuda_backend.sh -c
.PHONY: check_fmt_js # Check javascript code format
check_fmt_js: check_nvm_installed
source ~/.nvm/nvm.sh && \
nvm install $(NODE_VERSION) && \
nvm use $(NODE_VERSION) && \
$(MAKE) -C tfhe/web_wasm_parallel_tests check_fmt
.PHONY: clippy_gpu # Run clippy lints on tfhe with "gpu" enabled
clippy_gpu: install_rs_check_toolchain
clippy_gpu: install_rs_check_toolchain clippy_cuda_backend
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,gpu \
--all-targets \
--features=$(TARGET_ARCH_FEATURE),integer,shortint,gpu \
-p $(TFHE_SPEC) -- --no-deps -D warnings
.PHONY: fix_newline # Fix newline at end of file issues to be UNIX compliant
@@ -221,10 +176,6 @@ fix_newline: check_linelint_installed
check_newline: check_linelint_installed
linelint .
.PHONY: lint_workflow # Run static linter on GitHub workflows
lint_workflow: check_actionlint_installed
actionlint
.PHONY: clippy_core # Run clippy lints on core_crypto with and without experimental features
clippy_core: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
@@ -267,7 +218,7 @@ clippy: install_rs_check_toolchain
.PHONY: clippy_c_api # Run clippy lints enabling the boolean, shortint and the C API
clippy_c_api: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api \
--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api \
-p $(TFHE_SPEC) -- --no-deps -D warnings
.PHONY: clippy_js_wasm_api # Run clippy lints enabling the boolean, shortint, integer and the js wasm API
@@ -277,35 +228,30 @@ clippy_js_wasm_api: install_rs_check_toolchain
-p $(TFHE_SPEC) -- --no-deps -D warnings
.PHONY: clippy_tasks # Run clippy lints on helper tasks crate.
clippy_tasks: install_rs_check_toolchain
clippy_tasks:
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
-p tasks -- --no-deps -D warnings
.PHONY: clippy_trivium # Run clippy lints on Trivium app
clippy_trivium: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
-p tfhe-trivium -- --no-deps -D warnings
.PHONY: clippy_all_targets # Run clippy lints on all targets (benches, examples, etc.)
clippy_all_targets: install_rs_check_toolchain
clippy_all_targets:
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,zk-pok-experimental \
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache \
-p $(TFHE_SPEC) -- --no-deps -D warnings
.PHONY: clippy_concrete_csprng # Run clippy lints on concrete-csprng
clippy_concrete_csprng: install_rs_check_toolchain
clippy_concrete_csprng:
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
--features=$(TARGET_ARCH_FEATURE) \
-p concrete-csprng -- --no-deps -D warnings
.PHONY: clippy_zk_pok # Run clippy lints on tfhe-zk-pok
clippy_zk_pok: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
-p tfhe-zk-pok -- --no-deps -D warnings
.PHONY: clippy_all # Run all clippy targets
clippy_all: clippy clippy_boolean clippy_shortint clippy_integer clippy_all_targets clippy_c_api \
clippy_js_wasm_api clippy_tasks clippy_core clippy_concrete_csprng clippy_zk_pok clippy_trivium
clippy_js_wasm_api clippy_tasks clippy_core clippy_concrete_csprng clippy_trivium
.PHONY: clippy_fast # Run main clippy targets
clippy_fast: clippy clippy_all_targets clippy_c_api clippy_js_wasm_api clippy_tasks clippy_core \
@@ -313,7 +259,7 @@ clippy_concrete_csprng
.PHONY: clippy_cuda_backend # Run clippy lints on the tfhe-cuda-backend
clippy_cuda_backend: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
-p tfhe-cuda-backend -- --no-deps -D warnings
.PHONY: build_core # Build core_crypto without experimental features
@@ -354,11 +300,6 @@ build_tfhe_full: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer -p $(TFHE_SPEC) --all-targets
.PHONY: build_tfhe_coverage # Build with test coverage enabled
build_tfhe_coverage: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS) --cfg tarpaulin" cargo $(CARGO_RS_BUILD_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache -p $(TFHE_SPEC) --tests
.PHONY: symlink_c_libs_without_fingerprint # Link the .a and .so files without the changing hash part in target
symlink_c_libs_without_fingerprint:
@./scripts/symlink_c_libs_without_fingerprint.sh \
@@ -368,21 +309,21 @@ symlink_c_libs_without_fingerprint:
.PHONY: build_c_api # Build the C API for boolean, shortint and integer
build_c_api: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api,zk-pok-experimental,$(FORWARD_COMPAT_FEATURE) \
--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api,$(FORWARD_COMPAT_FEATURE) \
-p $(TFHE_SPEC)
@"$(MAKE)" symlink_c_libs_without_fingerprint
.PHONY: build_c_api_gpu # Build the C API for boolean, shortint and integer
build_c_api_gpu: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api,zk-pok-experimental,gpu \
--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api,gpu \
-p $(TFHE_SPEC)
@"$(MAKE)" symlink_c_libs_without_fingerprint
.PHONY: build_c_api_experimental_deterministic_fft # Build the C API for boolean, shortint and integer with experimental deterministic FFT
build_c_api_experimental_deterministic_fft: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api,zk-pok-experimental,experimental-force_fft_algo_dif4,$(FORWARD_COMPAT_FEATURE) \
--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api,experimental-force_fft_algo_dif4,$(FORWARD_COMPAT_FEATURE) \
-p $(TFHE_SPEC)
@"$(MAKE)" symlink_c_libs_without_fingerprint
@@ -391,7 +332,7 @@ build_web_js_api: install_rs_build_toolchain install_wasm_pack
cd tfhe && \
RUSTFLAGS="$(WASM_RUSTFLAGS)" rustup run "$(RS_BUILD_TOOLCHAIN)" \
wasm-pack build --release --target=web \
-- --features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,zk-pok-experimental
-- --features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api
.PHONY: build_web_js_api_parallel # Build the js API targeting the web browser with parallelism support
build_web_js_api_parallel: install_rs_check_toolchain install_wasm_pack
@@ -399,7 +340,7 @@ build_web_js_api_parallel: install_rs_check_toolchain install_wasm_pack
rustup component add rust-src --toolchain $(RS_CHECK_TOOLCHAIN) && \
RUSTFLAGS="$(WASM_RUSTFLAGS) -C target-feature=+atomics,+bulk-memory,+mutable-globals" rustup run $(RS_CHECK_TOOLCHAIN) \
wasm-pack build --release --target=web \
-- --features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,parallel-wasm-api,zk-pok-experimental \
-- --features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,parallel-wasm-api \
-Z build-std=panic_abort,std
.PHONY: build_node_js_api # Build the js API targeting nodejs
@@ -407,20 +348,29 @@ build_node_js_api: install_rs_build_toolchain install_wasm_pack
cd tfhe && \
RUSTFLAGS="$(WASM_RUSTFLAGS)" rustup run "$(RS_BUILD_TOOLCHAIN)" \
wasm-pack build --release --target=nodejs \
-- --features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,zk-pok-experimental
-- --features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api
.PHONY: build_concrete_csprng # Build concrete_csprng
build_concrete_csprng: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE) -p concrete-csprng --all-targets
#.PHONY: test_core_crypto # Run the tests of the core_crypto module including experimental ones
#test_core_crypto: install_rs_build_toolchain install_rs_check_toolchain
# RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
# --features=$(TARGET_ARCH_FEATURE),experimental -p $(TFHE_SPEC) -- core_crypto::
# @if [[ "$(AVX512_SUPPORT)" == "ON" ]]; then \
# RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
# --features=$(TARGET_ARCH_FEATURE),experimental,$(AVX512_FEATURE) -p $(TFHE_SPEC) -- core_crypto::; \
# fi
.PHONY: test_core_crypto # Run the tests of the core_crypto module including experimental ones
test_core_crypto: install_rs_build_toolchain install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE),experimental,zk-pok-experimental -p $(TFHE_SPEC) -- core_crypto::
--features=$(TARGET_ARCH_FEATURE),experimental -p $(TFHE_SPEC) -- glwe_encrypt_tensor_prod_decrypt
@if [[ "$(AVX512_SUPPORT)" == "ON" ]]; then \
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE),experimental,zk-pok-experimental,$(AVX512_FEATURE) -p $(TFHE_SPEC) -- core_crypto::; \
--features=$(TARGET_ARCH_FEATURE),experimental,$(AVX512_FEATURE) -p $(TFHE_SPEC) -- core_crypto::; \
fi
.PHONY: test_core_crypto_cov # Run the tests of the core_crypto module with code coverage
@@ -428,38 +378,30 @@ test_core_crypto_cov: install_rs_build_toolchain install_rs_check_toolchain inst
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) tarpaulin --profile $(CARGO_PROFILE) \
--out xml --output-dir coverage/core_crypto --line --engine llvm --timeout 500 \
--implicit-test-threads $(COVERAGE_EXCLUDED_FILES) \
--features=$(TARGET_ARCH_FEATURE),experimental,internal-keycache \
--features=$(TARGET_ARCH_FEATURE),experimental,internal-keycache,__coverage \
-p $(TFHE_SPEC) -- core_crypto::
@if [[ "$(AVX512_SUPPORT)" == "ON" ]]; then \
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) tarpaulin --profile $(CARGO_PROFILE) \
--out xml --output-dir coverage/core_crypto_avx512 --line --engine llvm --timeout 500 \
--implicit-test-threads $(COVERAGE_EXCLUDED_FILES) \
--features=$(TARGET_ARCH_FEATURE),experimental,internal-keycache,$(AVX512_FEATURE) \
-p $(TFHE_SPEC) -- -Z unstable-options --report-time core_crypto::; \
--features=$(TARGET_ARCH_FEATURE),experimental,internal-keycache,__coverage,$(AVX512_FEATURE) \
-p $(TFHE_SPEC) -- core_crypto::; \
fi
.PHONY: test_cuda_backend # Run the internal tests of the CUDA backend
test_cuda_backend:
mkdir -p "$(TFHECUDA_BUILD)" && \
cd "$(TFHECUDA_BUILD)" && \
cmake .. -DCMAKE_BUILD_TYPE=Release -DTFHE_CUDA_BACKEND_BUILD_TESTS=ON && \
make -j "$(CPU_COUNT)" && \
make test
.PHONY: test_gpu # Run the tests of the core_crypto module including experimental on the gpu backend
test_gpu: test_core_crypto_gpu test_integer_gpu test_cuda_backend
test_gpu: test_core_crypto_gpu test_integer_gpu
.PHONY: test_core_crypto_gpu # Run the tests of the core_crypto module including experimental on the gpu backend
test_core_crypto_gpu: install_rs_build_toolchain
test_core_crypto_gpu: install_rs_build_toolchain install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE),gpu -p $(TFHE_SPEC) -- core_crypto::gpu::
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE),gpu -p $(TFHE_SPEC) -- core_crypto::gpu::
.PHONY: test_integer_gpu # Run the tests of the integer module including experimental on the gpu backend
test_integer_gpu: install_rs_build_toolchain
test_integer_gpu: install_rs_build_toolchain install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- integer::gpu::server_key:: --test-threads=6
--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- integer::gpu::server_key::
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- integer::gpu::server_key::
@@ -473,8 +415,8 @@ test_boolean_cov: install_rs_check_toolchain install_tarpaulin
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) tarpaulin --profile $(CARGO_PROFILE) \
--out xml --output-dir coverage/boolean --line --engine llvm --timeout 500 \
$(COVERAGE_EXCLUDED_FILES) \
--features=$(TARGET_ARCH_FEATURE),boolean,internal-keycache \
-p $(TFHE_SPEC) -- -Z unstable-options --report-time boolean::
--features=$(TARGET_ARCH_FEATURE),boolean,internal-keycache,__coverage \
-p $(TFHE_SPEC) -- boolean::
.PHONY: test_c_api_rs # Run the rust tests for the C API
test_c_api_rs: install_rs_check_toolchain
@@ -485,14 +427,14 @@ test_c_api_rs: install_rs_check_toolchain
.PHONY: test_c_api_c # Run the C tests for the C API
test_c_api_c: build_c_api
./scripts/c_api_tests.sh --cargo-profile "$(CARGO_PROFILE)"
./scripts/c_api_tests.sh
.PHONY: test_c_api # Run all the tests for the C API
test_c_api: test_c_api_rs test_c_api_c
.PHONY: test_c_api_gpu # Run the C tests for the C API
test_c_api_gpu: build_c_api_gpu
./scripts/c_api_tests.sh --gpu --cargo-profile "$(CARGO_PROFILE)"
./scripts/c_api_tests.sh --gpu
.PHONY: test_shortint_ci # Run the tests for shortint ci
test_shortint_ci: install_rs_build_toolchain install_cargo_nextest
@@ -518,8 +460,8 @@ test_shortint_cov: install_rs_check_toolchain install_tarpaulin
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) tarpaulin --profile $(CARGO_PROFILE) \
--out xml --output-dir coverage/shortint --line --engine llvm --timeout 500 \
$(COVERAGE_EXCLUDED_FILES) \
--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache \
-p $(TFHE_SPEC) -- -Z unstable-options --report-time shortint::
--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache,__coverage \
-p $(TFHE_SPEC) -- shortint::
.PHONY: test_integer_ci # Run the tests for integer ci
test_integer_ci: install_rs_check_toolchain install_cargo_nextest
@@ -579,37 +521,22 @@ test_integer: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache -p $(TFHE_SPEC) -- integer::
.PHONY: test_integer_cov # Run the tests of the integer module with code coverage
test_integer_cov: install_rs_check_toolchain install_tarpaulin
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) tarpaulin --profile $(CARGO_PROFILE) \
--out xml --output-dir coverage/integer --line --engine llvm --timeout 500 \
--implicit-test-threads \
--exclude-files $(COVERAGE_EXCLUDED_FILES) \
--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache \
-p $(TFHE_SPEC) -- -Z unstable-options --report-time integer::
.PHONY: test_high_level_api # Run all the tests for high_level_api
test_high_level_api: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,zk-pok-experimental -p $(TFHE_SPEC) \
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache -p $(TFHE_SPEC) \
-- high_level_api::
test_high_level_api_gpu: install_rs_build_toolchain install_cargo_nextest
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) nextest run --cargo-profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,gpu -p $(TFHE_SPEC) \
-E "test(/high_level_api::.*gpu.*/)"
.PHONY: test_user_doc # Run tests from the .md documentation
test_user_doc: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) --doc \
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,pbs-stats,zk-pok-experimental \
-p $(TFHE_SPEC) \
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache -p $(TFHE_SPEC) \
-- test_user_docs::
.PHONY: test_user_doc_gpu # Run tests for GPU from the .md documentation
test_user_doc_gpu: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) --doc \
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,gpu,zk-pok-experimental -p $(TFHE_SPEC) \
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,gpu -p $(TFHE_SPEC) \
-- test_user_docs::
.PHONY: test_fhe_strings # Run tests for fhe_strings example
@@ -644,79 +571,48 @@ test_kreyvium: install_rs_build_toolchain
-p tfhe-trivium -- --test-threads=1 kreyvium::
.PHONY: test_concrete_csprng # Run concrete-csprng tests
test_concrete_csprng: install_rs_build_toolchain
test_concrete_csprng:
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE) -p concrete-csprng
.PHONY: test_zk_pok # Run tfhe-zk-pok-experimental tests
test_zk_pok: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-p tfhe-zk-pok
.PHONY: doc # Build rust doc
doc: install_rs_check_toolchain
@# Even though we are not in docs.rs, this allows to "just" build the doc
DOCS_RS=1 \
RUSTDOCFLAGS="--html-in-header katex-header.html" \
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" doc \
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,gpu,internal-keycache,experimental --no-deps -p $(TFHE_SPEC)
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer --no-deps -p $(TFHE_SPEC)
.PHONY: docs # Build rust doc alias for doc
docs: doc
.PHONY: lint_doc # Build rust doc with linting enabled
lint_doc: install_rs_check_toolchain
@# Even though we are not in docs.rs, this allows to "just" build the doc
DOCS_RS=1 \
RUSTDOCFLAGS="--html-in-header katex-header.html -Dwarnings" \
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" doc \
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,gpu,internal-keycache,experimental -p $(TFHE_SPEC) --no-deps
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer -p $(TFHE_SPEC) --no-deps
.PHONY: lint_docs # Build rust doc with linting enabled alias for lint_doc
lint_docs: lint_doc
.PHONY: format_doc_latex # Format the documentation latex equations to avoid broken rendering.
format_doc_latex:
RUSTFLAGS="" cargo xtask format_latex_doc
cargo xtask format_latex_doc
@"$(MAKE)" --no-print-directory fmt
@printf "\n===============================\n\n"
@printf "Please manually inspect changes made by format_latex_doc, rustfmt can break equations \
if the line length is exceeded\n"
@printf "\n===============================\n"
.PHONY: check_md_docs_are_tested # Checks that the rust codeblocks in our .md files are tested
check_md_docs_are_tested:
RUSTFLAGS="" cargo xtask check_tfhe_docs_are_tested
.PHONY: check_intra_md_links # Checks broken internal links in Markdown docs
check_intra_md_links: install_mlc
mlc --offline --match-file-extension tfhe/docs
.PHONY: check_md_links # Checks all broken links in Markdown docs
check_md_links: install_mlc
mlc --match-file-extension tfhe/docs
.PHONY: check_compile_tests # Build tests in debug without running them
check_compile_tests: install_rs_build_toolchain
check_compile_tests:
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --no-run \
--features=$(TARGET_ARCH_FEATURE),experimental,boolean,shortint,integer,internal-keycache \
-p $(TFHE_SPEC)
@if [[ "$(OS)" == "Linux" || "$(OS)" == "Darwin" ]]; then \
"$(MAKE)" build_c_api && \
./scripts/c_api_tests.sh --build-only --cargo-profile "$(CARGO_PROFILE)"; \
./scripts/c_api_tests.sh --build-only; \
fi
.PHONY: check_compile_tests_benches_gpu # Build tests in debug without running them
check_compile_tests_benches_gpu: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --no-run \
--features=$(TARGET_ARCH_FEATURE),experimental,boolean,shortint,integer,internal-keycache,gpu \
-p $(TFHE_SPEC)
mkdir -p "$(TFHECUDA_BUILD)" && \
cd "$(TFHECUDA_BUILD)" && \
cmake .. -DCMAKE_BUILD_TYPE=Debug -DTFHE_CUDA_BACKEND_BUILD_TESTS=ON -DTFHE_CUDA_BACKEND_BUILD_BENCHMARKS=ON && \
make -j "$(CPU_COUNT)"
.PHONY: build_nodejs_test_docker # Build a docker image with tools to run nodejs tests for wasm API
build_nodejs_test_docker:
DOCKER_BUILDKIT=1 docker build --build-arg RUST_TOOLCHAIN="$(RS_BUILD_TOOLCHAIN)" \
@@ -827,6 +723,8 @@ bench_oprf: install_rs_check_toolchain
--bench oprf-integer-bench \
--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
.PHONY: bench_shortint_multi_bit # Run benchmarks for shortint using multi-bit parameters
bench_shortint_multi_bit: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=MULTI_BIT \
@@ -835,6 +733,7 @@ bench_shortint_multi_bit: install_rs_check_toolchain
--bench shortint-bench \
--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
.PHONY: bench_boolean # Run benchmarks for boolean
bench_boolean: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
@@ -847,12 +746,6 @@ bench_pbs: install_rs_check_toolchain
--bench pbs-bench \
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
.PHONY: bench_pbs128 # Run benchmarks for PBS using FFT 128 bits
bench_pbs128: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench pbs128-bench \
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
.PHONY: bench_pbs_gpu # Run benchmarks for PBS on GPU backend
bench_pbs_gpu: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
@@ -884,10 +777,9 @@ ci_bench_web_js_api_parallel: build_web_js_api_parallel
#
# Utility tools
#
.PHONY: gen_key_cache # Run the script to generate keys and cache them for shortint tests
gen_key_cache: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS) --cfg tarpaulin" cargo $(CARGO_RS_BUILD_TOOLCHAIN) run --profile $(CARGO_PROFILE) \
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) run --profile $(CARGO_PROFILE) \
--example generates_test_keys \
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,internal-keycache -- \
$(MULTI_BIT_ONLY) $(COVERAGE_ONLY)
@@ -960,15 +852,13 @@ sha256_bool: install_rs_check_toolchain
--features=$(TARGET_ARCH_FEATURE),boolean
.PHONY: pcc # pcc stands for pre commit checks (except GPU)
pcc: no_tfhe_typo no_dbg_log check_fmt lint_doc check_md_docs_are_tested check_intra_md_links \
clippy_all check_compile_tests
pcc: no_tfhe_typo no_dbg_log check_fmt lint_doc clippy_all check_compile_tests
.PHONY: pcc_gpu # pcc stands for pre commit checks for GPU compilation
pcc_gpu: clippy_gpu clippy_cuda_backend check_compile_tests_benches_gpu
pcc_gpu: pcc clippy_gpu
.PHONY: fpcc # pcc stands for pre commit checks, the f stands for fast
fpcc: no_tfhe_typo no_dbg_log check_fmt lint_doc check_md_docs_are_tested clippy_fast \
check_compile_tests
fpcc: no_tfhe_typo no_dbg_log check_fmt lint_doc clippy_fast check_compile_tests
.PHONY: conformance # Automatically fix problems that can be fixed
conformance: fix_newline fmt

View File

@@ -1,10 +1,6 @@
<p align="center">
<!-- product name logo -->
<picture>
<source media="(prefers-color-scheme: dark)" srcset="https://github.com/zama-ai/tfhe-rs/assets/157474013/5283e0ba-da1e-43af-9f2a-c5221367a12b">
<source media="(prefers-color-scheme: light)" srcset="https://github.com/zama-ai/tfhe-rs/assets/157474013/b94a8c96-7595-400b-9311-70765c706955">
<img width=600 alt="Zama TFHE-rs">
</picture>
<img width=600 src="https://user-images.githubusercontent.com/5758427/231206749-8f146b97-3c5a-4201-8388-3ffa88580415.png">
</p>
<hr/>
@@ -50,7 +46,7 @@ production-ready library for all the advanced features of TFHE.
<br></br>
## Table of Contents
- **[Getting started](#getting-started)**
- **[Getting Started](#getting-started)**
- [Cargo.toml configuration](#cargotoml-configuration)
- [A simple example](#a-simple-example)
- **[Resources](#resources)**
@@ -65,7 +61,7 @@ production-ready library for all the advanced features of TFHE.
- **[Support](#support)**
<br></br>
## Getting started
## Getting Started
### Cargo.toml configuration
To use the latest version of `TFHE-rs` in your project, you first need to add it as a dependency in your `Cargo.toml`:
@@ -89,7 +85,7 @@ tfhe = { version = "*", features = ["boolean", "shortint", "integer", "x86_64"]
```
> [!Note]
> Note: You need to use a Rust version >= 1.73 to compile TFHE-rs.
> Note: You need to use a Rust version >= 1.72 to compile TFHE-rs.
> [!Note]
> Note: aarch64-based machines are not yet supported for Windows as it's currently missing an entropy source to be able to seed the [CSPRNGs](https://en.wikipedia.org/wiki/Cryptographically_secure_pseudorandom_number_generator) used in TFHE-rs.
@@ -131,13 +127,13 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
// Clear equivalent computations: 1344 * 5 = 6720
let encrypted_res_mul = &encrypted_a * &encrypted_b;
// Clear equivalent computations: 6720 >> 5 = 210
// Clear equivalent computations: 1344 >> 5 = 42
encrypted_a = &encrypted_res_mul >> &encrypted_b;
// Clear equivalent computations: let casted_a = a as u8;
let casted_a: FheUint8 = encrypted_a.cast_into();
// Clear equivalent computations: min(210, 7) = 7
// Clear equivalent computations: min(42, 7) = 7
let encrypted_res_min = &casted_a.min(&encrypted_c);
// Operation between clear and encrypted data:
@@ -177,12 +173,12 @@ to run in release mode with cargo's `--release` flag to have the best performanc
<br></br>
### Tutorials
- [[Video tutorial] Implement signed integers using TFHE-rs ](https://www.zama.ai/post/video-tutorial-implement-signed-integers-ssing-tfhe-rs)
- [Homomorphic parity bit](https://docs.zama.ai/tfhe-rs/tutorials/parity_bit)
- [Homomorphic case changing on Ascii string](https://docs.zama.ai/tfhe-rs/tutorials/ascii_fhe_string)
- [Homomorphic Parity Bit](https://docs.zama.ai/tfhe-rs/tutorials/parity_bit)
- [Homomorphic Case Changing on Ascii String](https://docs.zama.ai/tfhe-rs/tutorials/ascii_fhe_string)
- [Boolean SHA256 with TFHE-rs](https://www.zama.ai/post/boolean-sha256-tfhe-rs)
- [Dark market with TFHE-rs](https://www.zama.ai/post/dark-market-tfhe-rs)
- [Regular expression engine with TFHE-rs](https://www.zama.ai/post/regex-engine-tfhe-rs)
- [Dark Market with TFHE-rs](https://www.zama.ai/post/dark-market-tfhe-rs)
- [Regular Expression Engine with TFHE-rs](https://www.zama.ai/post/regex-engine-tfhe-rs)
*Explore more useful resources in [TFHE-rs tutorials](https://docs.zama.ai/tfhe-rs/tutorials) and [Awesome Zama repo](https://github.com/zama-ai/awesome-zama)*
<br></br>
@@ -198,7 +194,7 @@ Full, comprehensive documentation is available here: [https://docs.zama.ai/tfhe-
### Disclaimers
#### Security estimation
#### Security Estimation
Security estimations are done using the
[Lattice Estimator](https://github.com/malb/lattice-estimator)
@@ -206,13 +202,7 @@ with `red_cost_model = reduction.RC.BDGL16`.
When a new update is published in the Lattice Estimator, we update parameters accordingly.
### Security model
The default parameters for the TFHE-rs library are chosen considering the IND-CPA security model, and are selected with a bootstrapping failure probability fixed at p_error = $2^{-40}$. In particular, it is assumed that the results of decrypted computations are not shared by the secret key owner with any third parties, as such an action can lead to leakage of the secret encryption key. If you are designing an application where decryptions must be shared, you will need to craft custom encryption parameters which are chosen in consideration of the IND-CPA^D security model [1].
[1] Li, Baiyu, et al. "Securing approximate homomorphic encryption using differential privacy." Annual International Cryptology Conference. Cham: Springer Nature Switzerland, 2022. https://eprint.iacr.org/2022/816.pdf
#### Side-channel attacks
#### Side-Channel Attacks
Mitigation for side-channel attacks has not yet been implemented in TFHE-rs,
and will be released in upcoming versions.
@@ -241,23 +231,7 @@ Becoming an approved contributor involves signing our Contributor License Agreem
<br></br>
### License
This software is distributed under the **BSD-3-Clause-Clear** license. Read [this](LICENSE) for more details.
#### FAQ
**Is Zama's technology free to use?**
>Zama's libraries are free to use under the BSD 3-Clause Clear license only for development, research, prototyping, and experimentation purposes. However, for any commercial use of Zama's open source code, companies must purchase Zama's commercial patent license.
>
>Everything we do is open source and we are very transparent on what it means for our users, you can read more about how we monetize our open source products at Zama in [this blogpost](https://www.zama.ai/post/open-source).
**What do I need to do if I want to use Zama's technology for commercial purposes?**
>To commercially use Zama's technology you need to be granted Zama's patent license. Please contact us at hello@zama.ai for more information.
**Do you file IP on your technology?**
>Yes, all Zama's technologies are patented.
**Can you customize a solution for my specific use case?**
>We are open to collaborating and advancing the FHE space with our partners. If you have specific needs, please email us at hello@zama.ai.
This software is distributed under the **BSD-3-Clause-Clear** license. If you have any questions, please contact us at hello@zama.ai.
<p align="right">
<a href="#about" > ↑ Back to top </a>
</p>
@@ -266,11 +240,7 @@ This software is distributed under the **BSD-3-Clause-Clear** license. Read [thi
## Support
<a target="_blank" href="https://community.zama.ai">
<picture>
<source media="(prefers-color-scheme: dark)" srcset="https://github.com/zama-ai/tfhe-rs/assets/157474013/08656d0a-3f44-4126-b8b6-8c601dff5380">
<source media="(prefers-color-scheme: light)" srcset="https://github.com/zama-ai/tfhe-rs/assets/157474013/1c9c9308-50ac-4aab-a4b9-469bb8c536a4">
<img alt="Support">
</picture>
<img src="https://github.com/zama-ai/tfhe-rs/assets/157474013/8da6cf5b-51a0-4c86-9e75-fd0e4a4c64a4">
</a>
🌟 If you find this project helpful or interesting, please consider giving it a star on GitHub! Your support helps to grow the community and motivates further development.

View File

@@ -15,6 +15,7 @@ Example of a Rust main below:
```rust
use tfhe::{ConfigBuilder, generate_keys, FheBool};
use tfhe::prelude::*;
use tfhe_trivium::TriviumStream;
fn get_hexadecimal_string_from_lsb_first_stream(a: Vec<bool>) -> String {
@@ -71,7 +72,7 @@ fn get_hexadecimal_string_from_lsb_first_stream(a: Vec<bool>) -> String {
}
fn main() {
let config = ConfigBuilder::default().build();
let config = ConfigBuilder::all_disabled().enable_default_bool().build();
let (client_key, server_key) = generate_keys(config);
let key_string = "0053A6F94C9FF24598EB".to_string();
@@ -138,12 +139,14 @@ Example code:
```rust
use tfhe::shortint::prelude::*;
use tfhe::shortint::CastingKey;
use tfhe::{ConfigBuilder, generate_keys, FheUint64};
use tfhe::prelude::*;
use tfhe_trivium::TriviumStreamShortint;
fn test_shortint() {
let config = ConfigBuilder::default().build();
let config = ConfigBuilder::all_disabled().enable_default_integers().build();
let (hl_client_key, hl_server_key) = generate_keys(config);
let (client_key, server_key): (ClientKey, ServerKey) = gen_keys(PARAM_MESSAGE_1_CARRY_1_KS_PBS);
let ksk = CastingKey::new((&client_key, &server_key), (&hl_client_key, &hl_server_key));

View File

@@ -1,8 +1,10 @@
use criterion::Criterion;
use tfhe::prelude::*;
use tfhe::{generate_keys, ConfigBuilder, FheBool};
use tfhe_trivium::KreyviumStream;
use criterion::Criterion;
pub fn kreyvium_bool_gen(c: &mut Criterion) {
let config = ConfigBuilder::default().build();
let (client_key, server_key) = generate_keys(config);

View File

@@ -1,8 +1,10 @@
use criterion::Criterion;
use tfhe::prelude::*;
use tfhe::{generate_keys, ConfigBuilder, FheUint64, FheUint8};
use tfhe_trivium::{KreyviumStreamByte, TransCiphering};
use criterion::Criterion;
pub fn kreyvium_byte_gen(c: &mut Criterion) {
let config = ConfigBuilder::default()
.enable_function_evaluation()

View File

@@ -1,9 +1,12 @@
use criterion::Criterion;
use tfhe::prelude::*;
use tfhe::shortint::prelude::*;
use tfhe::shortint::KeySwitchingKey;
use tfhe::{generate_keys, ConfigBuilder, FheUint64};
use tfhe_trivium::{KreyviumStreamShortint, TransCiphering};
use criterion::Criterion;
pub fn kreyvium_shortint_warmup(c: &mut Criterion) {
let config = ConfigBuilder::default().build();
let (hl_client_key, hl_server_key) = generate_keys(config);

View File

@@ -1,8 +1,10 @@
use criterion::Criterion;
use tfhe::prelude::*;
use tfhe::{generate_keys, ConfigBuilder, FheBool};
use tfhe_trivium::TriviumStream;
use criterion::Criterion;
pub fn trivium_bool_gen(c: &mut Criterion) {
let config = ConfigBuilder::default().build();
let (client_key, server_key) = generate_keys(config);

View File

@@ -1,8 +1,10 @@
use criterion::Criterion;
use tfhe::prelude::*;
use tfhe::{generate_keys, ConfigBuilder, FheUint64, FheUint8};
use tfhe_trivium::{TransCiphering, TriviumStreamByte};
use criterion::Criterion;
pub fn trivium_byte_gen(c: &mut Criterion) {
let config = ConfigBuilder::default().build();
let (client_key, server_key) = generate_keys(config);

View File

@@ -1,9 +1,12 @@
use criterion::Criterion;
use tfhe::prelude::*;
use tfhe::shortint::prelude::*;
use tfhe::shortint::KeySwitchingKey;
use tfhe::{generate_keys, ConfigBuilder, FheUint64};
use tfhe_trivium::{TransCiphering, TriviumStreamShortint};
use criterion::Criterion;
pub fn trivium_shortint_warmup(c: &mut Criterion) {
let config = ConfigBuilder::default().build();
let (hl_client_key, hl_server_key) = generate_keys(config);

View File

@@ -2,10 +2,12 @@
//! for the representation of the inner bits.
use crate::static_deque::StaticDeque;
use rayon::prelude::*;
use tfhe::prelude::*;
use tfhe::{set_server_key, unset_server_key, FheBool, ServerKey};
use rayon::prelude::*;
/// Internal trait specifying which operations are necessary for KreyviumStream generic type
pub trait KreyviumBoolInput<OpOutput>:
Sized

View File

@@ -2,10 +2,12 @@
//! for the representation of the inner bits.
use crate::static_deque::{StaticByteDeque, StaticByteDequeInput};
use rayon::prelude::*;
use tfhe::prelude::*;
use tfhe::{set_server_key, unset_server_key, FheUint8, ServerKey};
use rayon::prelude::*;
/// Internal trait specifying which operations are necessary for KreyviumStreamByte generic type
pub trait KreyviumByteInput<OpOutput>:
Sized

View File

@@ -1,7 +1,9 @@
use crate::static_deque::StaticDeque;
use rayon::prelude::*;
use tfhe::shortint::prelude::*;
use rayon::prelude::*;
/// KreyviumStreamShortint: a struct implementing the Kreyvium stream cipher, using a generic
/// Ciphertext for the internal representation of bits (intended to represent a single bit). To be
/// able to compute FHE operations, it also owns a ServerKey.
@@ -34,7 +36,7 @@ impl KreyviumStreamShortint {
let mut c_register: [Ciphertext; 111] = [0; 111].map(|x| sk.create_trivial(x));
for i in 0..93 {
a_register[i].clone_from(&key[128 - 93 + i]);
a_register[i] = key[128 - 93 + i].clone();
}
for i in 0..84 {
b_register[i] = sk.create_trivial(iv[128 - 84 + i]);

View File

@@ -1,7 +1,8 @@
use crate::{KreyviumStream, KreyviumStreamByte, KreyviumStreamShortint, TransCiphering};
use tfhe::prelude::*;
use tfhe::{generate_keys, ConfigBuilder, FheBool, FheUint64, FheUint8};
use crate::{KreyviumStream, KreyviumStreamByte, KreyviumStreamShortint, TransCiphering};
// Values for these tests come from the github repo renaud1239/Kreyvium,
// commit fd6828f68711276c25f55e605935028f5e843f43

View File

@@ -1,6 +1,5 @@
#[allow(clippy::module_inception)]
mod static_deque;
pub use static_deque::StaticDeque;
mod static_byte_deque;
pub use static_byte_deque::{StaticByteDeque, StaticByteDequeInput};

View File

@@ -4,6 +4,7 @@
//! This is pretending to store bits, and allows accessing bits in chunks of 8 consecutive.
use crate::static_deque::StaticDeque;
use tfhe::FheUint8;
/// Internal trait specifying which operations are needed by StaticByteDeque

View File

@@ -2,11 +2,13 @@
//! when trans ciphering is available to them.
use crate::{KreyviumStreamByte, KreyviumStreamShortint, TriviumStreamByte, TriviumStreamShortint};
use rayon::prelude::*;
use tfhe::prelude::*;
use tfhe::shortint::Ciphertext;
use tfhe::prelude::*;
use tfhe::{set_server_key, unset_server_key, FheUint64, FheUint8, ServerKey};
use rayon::prelude::*;
/// Trait specifying the interface for trans ciphering a FheUint64 object. Since it is meant
/// to be used with stream ciphers, encryption and decryption are by default the same.
pub trait TransCiphering {

View File

@@ -1,7 +1,8 @@
use crate::{TransCiphering, TriviumStream, TriviumStreamByte, TriviumStreamShortint};
use tfhe::prelude::*;
use tfhe::{generate_keys, ConfigBuilder, FheBool, FheUint64, FheUint8};
use crate::{TransCiphering, TriviumStream, TriviumStreamByte, TriviumStreamShortint};
// Values for these tests come from the github repo cantora/avr-crypto-lib, commit 2a5b018,
// file testvectors/trivium-80.80.test-vectors

View File

@@ -2,10 +2,12 @@
//! for the representation of the inner bits.
use crate::static_deque::StaticDeque;
use rayon::prelude::*;
use tfhe::prelude::*;
use tfhe::{set_server_key, unset_server_key, FheBool, ServerKey};
use rayon::prelude::*;
/// Internal trait specifying which operations are necessary for TriviumStream generic type
pub trait TriviumBoolInput<OpOutput>:
Sized

View File

@@ -2,10 +2,12 @@
//! for the representation of the inner bits.
use crate::static_deque::{StaticByteDeque, StaticByteDequeInput};
use rayon::prelude::*;
use tfhe::prelude::*;
use tfhe::{set_server_key, unset_server_key, FheUint8, ServerKey};
use rayon::prelude::*;
/// Internal trait specifying which operations are necessary for TriviumStreamByte generic type
pub trait TriviumByteInput<OpOutput>:
Sized

View File

@@ -1,7 +1,9 @@
use crate::static_deque::StaticDeque;
use rayon::prelude::*;
use tfhe::shortint::prelude::*;
use rayon::prelude::*;
/// TriviumStreamShortint: a struct implementing the Trivium stream cipher, using a generic
/// Ciphertext for the internal representation of bits (intended to represent a single bit). To be
/// able to compute FHE operations, it also owns a ServerKey.
@@ -32,7 +34,7 @@ impl TriviumStreamShortint {
let mut c_register: [Ciphertext; 111] = [0; 111].map(|x| sk.create_trivial(x));
for i in 0..80 {
a_register[93 - 80 + i].clone_from(&key[i]);
a_register[93 - 80 + i] = key[i].clone();
b_register[84 - 80 + i] = sk.create_trivial(iv[i]);
}

View File

@@ -1,6 +1,6 @@
[package]
name = "tfhe-cuda-backend"
version = "0.2.0"
version = "0.1.2"
edition = "2021"
authors = ["Zama team"]
license = "BSD-3-Clause-Clear"
@@ -13,4 +13,6 @@ keywords = ["fully", "homomorphic", "encryption", "fhe", "cryptography"]
[build-dependencies]
cmake = { version = "0.1" }
pkg-config = { version = "0.3" }
[dependencies]
thiserror = "1.0"

View File

@@ -2,18 +2,7 @@ use std::env;
use std::process::Command;
fn main() {
if let Ok(val) = env::var("DOCS_RS") {
if val.parse::<u32>() == Ok(1) {
return;
}
}
println!("Build tfhe-cuda-backend");
println!("cargo::rerun-if-changed=cuda/include");
println!("cargo::rerun-if-changed=cuda/src");
println!("cargo::rerun-if-changed=cuda/tests_and_benchmarks");
println!("cargo::rerun-if-changed=cuda/CMakeLists.txt");
println!("cargo::rerun-if-changed=src");
if env::consts::OS == "linux" {
let output = Command::new("./get_os_name.sh").output().unwrap();
let distribution = String::from_utf8(output.stdout).unwrap();
@@ -26,15 +15,7 @@ fn main() {
let dest = cmake::build("cuda");
println!("cargo:rustc-link-search=native={}", dest.display());
println!("cargo:rustc-link-lib=static=tfhe_cuda_backend");
// Try to find the cuda libs with pkg-config, default to the path used by the nvidia runfile
if pkg_config::Config::new()
.atleast_version("10")
.probe("cuda")
.is_err()
{
println!("cargo:rustc-link-search=native=/usr/local/cuda/lib64");
}
println!("cargo:rustc-link-search=native=/usr/local/cuda/lib64");
println!("cargo:rustc-link-lib=gomp");
println!("cargo:rustc-link-lib=cudart");
println!("cargo:rustc-link-search=native=/usr/lib/x86_64-linux-gnu/");

View File

@@ -1,2 +0,0 @@
/build/
include/cuda_config.h

View File

@@ -58,15 +58,10 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler ${OpenMP_CXX_FLAGS}")
if(${CUDA_SUCCESS})
set(CMAKE_CUDA_ARCHITECTURES native)
string(REPLACE "-arch=sm_" "" CUDA_ARCH "${ARCH}")
set(CUDA_ARCH "${CUDA_ARCH}0")
else()
set(CMAKE_CUDA_ARCHITECTURES 70)
set(CUDA_ARCH "700")
endif()
add_compile_definitions(CUDA_ARCH=${CUDA_ARCH})
# in production, should use -arch=sm_70 --ptxas-options=-v to see register spills -lineinfo for better debugging
set(CMAKE_CUDA_FLAGS
"${CMAKE_CUDA_FLAGS} -ccbin ${CMAKE_CXX_COMPILER} -O3 \
@@ -76,13 +71,10 @@ set(CMAKE_CUDA_FLAGS
set(INCLUDE_DIR include)
add_subdirectory(src)
enable_testing()
add_subdirectory(tests_and_benchmarks)
target_include_directories(tfhe_cuda_backend PRIVATE ${INCLUDE_DIR})
# This is required for rust cargo build
install(TARGETS tfhe_cuda_backend DESTINATION .)
install(TARGETS tfhe_cuda_backend DESTINATION lib)
# Define a function to add a lint target.
@@ -94,3 +86,5 @@ if(CPPLINT)
set_target_properties(all_lint PROPERTIES EXCLUDE_FROM_ALL TRUE)
# set_target_properties(all_lint PROPERTIES EXCLUDE_FROM_DEFAULT_BUILD TRUE)
endif()
enable_testing()

View File

@@ -1,4 +1,4 @@
#!/usr/bin/env bash
#!/bin/bash
set -e
@@ -6,14 +6,14 @@ while getopts ":c" option; do
case $option in
c)
# code to execute when flag1 is provided
find ./{include,src,tests_and_benchmarks/include,tests_and_benchmarks/tests,tests_and_benchmarks/benchmarks} -iregex '^.*\.\(cpp\|cu\|h\|cuh\)$' -print | xargs clang-format-15 -i -style='file' --dry-run --Werror
find ./{include,src} -iregex '^.*\.\(cpp\|cu\|h\|cuh\)$' -print | xargs clang-format-15 -i -style='file' --dry-run --Werror
cmake-format -i CMakeLists.txt -c .cmake-format-config.py
find ./{include,src,tests_and_benchmarks/include,tests_and_benchmarks/tests,tests_and_benchmarks/benchmarks} -type f -name "CMakeLists.txt" | xargs -I % sh -c 'cmake-format -i % -c .cmake-format-config.py'
find ./{include,src} -type f -name "CMakeLists.txt" | xargs -I % sh -c 'cmake-format -i % -c .cmake-format-config.py'
git diff --exit-code
exit
;;
esac
done
find ./{include,src,tests_and_benchmarks/include,tests_and_benchmarks/tests,tests_and_benchmarks/benchmarks} -iregex '^.*\.\(cpp\|cu\|h\|cuh\)$' -print | xargs clang-format-15 -i -style='file'
find ./{include,src} -iregex '^.*\.\(cpp\|cu\|h\|cuh\)$' -print | xargs clang-format-15 -i -style='file'
cmake-format -i CMakeLists.txt -c .cmake-format-config.py
find ./{include,src,tests_and_benchmarks/include,tests_and_benchmarks/tests,tests_and_benchmarks/benchmarks} -type f -name "CMakeLists.txt" | xargs -I % sh -c 'cmake-format -i % -c .cmake-format-config.py'
find ./{include,src} -type f -name "CMakeLists.txt" | xargs -I % sh -c 'cmake-format -i % -c .cmake-format-config.py'

View File

@@ -0,0 +1,118 @@
#ifndef CUDA_BOOTSTRAP_H
#define CUDA_BOOTSTRAP_H
#include "device.h"
#include <cstdint>
enum PBS_TYPE { MULTI_BIT = 0, LOW_LAT = 1, AMORTIZED = 2 };
extern "C" {
void cuda_fourier_polynomial_mul(void *input1, void *input2, void *output,
cuda_stream_t *stream,
uint32_t polynomial_size,
uint32_t total_polynomials);
void cuda_convert_lwe_bootstrap_key_32(void *dest, void *src,
cuda_stream_t *stream,
uint32_t input_lwe_dim,
uint32_t glwe_dim, uint32_t level_count,
uint32_t polynomial_size);
void cuda_convert_lwe_bootstrap_key_64(void *dest, void *src,
cuda_stream_t *stream,
uint32_t input_lwe_dim,
uint32_t glwe_dim, uint32_t level_count,
uint32_t polynomial_size);
void scratch_cuda_bootstrap_amortized_32(
cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t input_lwe_ciphertext_count,
uint32_t max_shared_memory, bool allocate_gpu_memory);
void scratch_cuda_bootstrap_amortized_64(
cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t input_lwe_ciphertext_count,
uint32_t max_shared_memory, bool allocate_gpu_memory);
void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory);
void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory);
void cleanup_cuda_bootstrap_amortized(cuda_stream_t *stream,
int8_t **pbs_buffer);
void scratch_cuda_bootstrap_low_latency_32(
cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory);
void scratch_cuda_bootstrap_low_latency_64(
cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory);
void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory);
void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory);
void cleanup_cuda_bootstrap_low_latency(cuda_stream_t *stream,
int8_t **pbs_buffer);
uint64_t get_buffer_size_bootstrap_amortized_64(
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory);
uint64_t get_buffer_size_bootstrap_low_latency_64(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory);
}
#ifdef __CUDACC__
__device__ inline int get_start_ith_ggsw(int i, uint32_t polynomial_size,
int glwe_dimension,
uint32_t level_count);
template <typename T>
__device__ T *get_ith_mask_kth_block(T *ptr, int i, int k, int level,
uint32_t polynomial_size,
int glwe_dimension, uint32_t level_count);
template <typename T>
__device__ T *get_ith_body_kth_block(T *ptr, int i, int k, int level,
uint32_t polynomial_size,
int glwe_dimension, uint32_t level_count);
template <typename T>
__device__ T *get_multi_bit_ith_lwe_gth_group_kth_block(
T *ptr, int g, int i, int k, int level, uint32_t grouping_factor,
uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t level_count);
#endif
#endif // CUDA_BOOTSTRAP_H

View File

@@ -0,0 +1,46 @@
#ifndef CUDA_MULTI_BIT_H
#define CUDA_MULTI_BIT_H
#include <cstdint>
extern "C" {
void cuda_convert_lwe_multi_bit_bootstrap_key_64(
void *dest, void *src, cuda_stream_t *stream, uint32_t input_lwe_dim,
uint32_t glwe_dim, uint32_t level_count, uint32_t polynomial_size,
uint32_t grouping_factor);
void cuda_multi_bit_pbs_lwe_ciphertext_vector_64(
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
uint32_t max_shared_memory, uint32_t chunk_size = 0);
void scratch_cuda_multi_bit_pbs_64(
cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
uint32_t max_shared_memory, bool allocate_gpu_memory,
uint32_t chunk_size = 0);
void cleanup_cuda_multi_bit_pbs(cuda_stream_t *stream, int8_t **pbs_buffer);
}
#ifdef __CUDACC__
__host__ uint32_t get_lwe_chunk_size(uint32_t lwe_dimension,
uint32_t level_count,
uint32_t glwe_dimension,
uint32_t num_samples);
__host__ uint32_t get_average_lwe_chunk_size(uint32_t lwe_dimension,
uint32_t level_count,
uint32_t glwe_dimension,
uint32_t ct_count);
__host__ uint64_t get_max_buffer_size_multibit_bootstrap(
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t max_input_lwe_ciphertext_count);
#endif
#endif // CUDA_MULTI_BIT_H

View File

@@ -4,14 +4,14 @@
#include <cstdint>
extern "C" {
void cuda_convert_lwe_ciphertext_vector_to_gpu_64(void *stream,
void cuda_convert_lwe_ciphertext_vector_to_gpu_64(void *dest, void *src,
void *v_stream,
uint32_t gpu_index,
void *dest, void *src,
uint32_t number_of_cts,
uint32_t lwe_dimension);
void cuda_convert_lwe_ciphertext_vector_to_cpu_64(void *stream,
void cuda_convert_lwe_ciphertext_vector_to_cpu_64(void *dest, void *src,
void *v_stream,
uint32_t gpu_index,
void *dest, void *src,
uint32_t number_of_cts,
uint32_t lwe_dimension);
};

View File

@@ -6,9 +6,9 @@
#include <cstdlib>
#include <cstring>
#include <cuda_runtime.h>
#include <vector>
#define synchronize_threads_in_block() __syncthreads()
extern "C" {
#define check_cuda_error(ans) \
@@ -27,33 +27,51 @@ inline void cuda_error(cudaError_t code, const char *file, int line) {
std::abort(); \
}
cudaStream_t cuda_create_stream(uint32_t gpu_index);
struct cuda_stream_t {
cudaStream_t stream;
uint32_t gpu_index;
void cuda_destroy_stream(cudaStream_t stream, uint32_t gpu_index);
cuda_stream_t(uint32_t gpu_index) {
this->gpu_index = gpu_index;
void cuda_synchronize_stream(cudaStream_t stream, uint32_t gpu_index);
check_cuda_error(cudaStreamCreate(&stream));
}
void release() {
check_cuda_error(cudaSetDevice(gpu_index));
check_cuda_error(cudaStreamDestroy(stream));
}
void synchronize() { check_cuda_error(cudaStreamSynchronize(stream)); }
};
cuda_stream_t *cuda_create_stream(uint32_t gpu_index);
void cuda_destroy_stream(cuda_stream_t *stream);
void *cuda_malloc(uint64_t size, uint32_t gpu_index);
void *cuda_malloc_async(uint64_t size, cudaStream_t stream, uint32_t gpu_index);
void *cuda_malloc_async(uint64_t size, cuda_stream_t *stream);
void cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index);
bool cuda_check_support_cooperative_groups();
bool cuda_check_support_thread_block_clusters();
void cuda_memcpy_to_cpu(void *dest, const void *src, uint64_t size);
void cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
cudaStream_t stream, uint32_t gpu_index);
cuda_stream_t *stream);
void cuda_memcpy_async_gpu_to_gpu(void *dest, void *src, uint64_t size,
cudaStream_t stream, uint32_t gpu_index);
cuda_stream_t *stream);
void cuda_memcpy_to_gpu(void *dest, void *src, uint64_t size);
void cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
cudaStream_t stream, uint32_t gpu_index);
cuda_stream_t *stream);
void cuda_memset_async(void *dest, uint64_t val, uint64_t size,
cudaStream_t stream, uint32_t gpu_index);
cuda_stream_t *stream);
int cuda_get_number_of_gpus();
@@ -61,18 +79,14 @@ void cuda_synchronize_device(uint32_t gpu_index);
void cuda_drop(void *ptr, uint32_t gpu_index);
void cuda_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index);
void cuda_drop_async(void *ptr, cuda_stream_t *stream);
int cuda_get_max_shared_memory(uint32_t gpu_index);
void cuda_stream_add_callback(cudaStream_t stream, uint32_t gpu_index,
cudaStreamCallback_t callback, void *user_data);
void cuda_synchronize_stream(cuda_stream_t *stream);
}
void host_free_on_stream_callback(cudaStream_t stream, cudaError_t status,
void *host_pointer);
template <typename Torus>
void cuda_set_value_async(cudaStream_t stream, uint32_t gpu_index,
Torus *d_array, Torus value, Torus n);
void cuda_set_value_async(cudaStream_t *stream, Torus *d_array, Torus value,
Torus n);
#endif

View File

@@ -1,18 +0,0 @@
#ifndef HELPER_MULTI_GPU_H
#define HELPER_MULTI_GPU_H
#include <mutex>
extern std::mutex m;
extern bool p2p_enabled;
extern "C" {
int cuda_setup_multi_gpu();
}
int get_active_gpu_count(int num_inputs, int gpu_count);
int get_num_inputs_on_gpu(int total_num_inputs, int gpu_index, int gpu_count);
int get_gpu_offset(int total_num_inputs, int gpu_index, int gpu_count);
#endif

File diff suppressed because it is too large Load Diff

View File

@@ -6,18 +6,16 @@
extern "C" {
void cuda_keyswitch_lwe_ciphertext_vector_32(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t gpu_offset = 0);
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
void *lwe_array_in, void *lwe_input_indexes, void *ksk,
uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, uint32_t base_log,
uint32_t level_count, uint32_t num_samples);
void cuda_keyswitch_lwe_ciphertext_vector_64(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t gpu_offset = 0);
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
void *lwe_array_in, void *lwe_input_indexes, void *ksk,
uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, uint32_t base_log,
uint32_t level_count, uint32_t num_samples);
}
#endif // CNCRT_KS_H_

View File

@@ -1,48 +1,48 @@
#ifndef CUDA_LINALG_H_
#define CUDA_LINALG_H_
#include "programmable_bootstrap.h"
#include "bootstrap.h"
#include <cstdint>
#include <device.h>
extern "C" {
void cuda_negate_lwe_ciphertext_vector_32(void *stream, uint32_t gpu_index,
void cuda_negate_lwe_ciphertext_vector_32(cuda_stream_t *stream,
void *lwe_array_out,
void *lwe_array_in,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count);
void cuda_negate_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index,
void cuda_negate_lwe_ciphertext_vector_64(cuda_stream_t *stream,
void *lwe_array_out,
void *lwe_array_in,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count);
void cuda_add_lwe_ciphertext_vector_32(void *stream, uint32_t gpu_index,
void cuda_add_lwe_ciphertext_vector_32(cuda_stream_t *stream,
void *lwe_array_out,
void *lwe_array_in_1,
void *lwe_array_in_2,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count);
void cuda_add_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index,
void cuda_add_lwe_ciphertext_vector_64(cuda_stream_t *stream,
void *lwe_array_out,
void *lwe_array_in_1,
void *lwe_array_in_2,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count);
void cuda_add_lwe_ciphertext_vector_plaintext_vector_32(
void *stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_in,
void *plaintext_array_in, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count);
void cuda_add_lwe_ciphertext_vector_plaintext_vector_64(
void *stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_in,
void *plaintext_array_in, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count);
void cuda_mult_lwe_ciphertext_vector_cleartext_vector_32(
void *stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_in,
void *cleartext_array_in, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count);
void cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
void *stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_in,
void *cleartext_array_in, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count);
}

View File

@@ -1,431 +0,0 @@
#ifndef CUDA_BOOTSTRAP_H
#define CUDA_BOOTSTRAP_H
#include "device.h"
#include <cstdint>
// PBS_TYPE is used as the second (non-type) template parameter of pbs_buffer;
// the specialization visible in this header is the one for CLASSICAL.
enum PBS_TYPE { MULTI_BIT = 0, CLASSICAL = 1 };
// PBS_VARIANT is stored in pbs_buffer and switched on by its constructor to
// decide which scratch buffers are sized and allocated. The TBC branch of that
// switch is only compiled when CUDA_ARCH >= 900.
enum PBS_VARIANT { DEFAULT = 0, CG = 1, TBC = 2 };
extern "C" {
void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
void *input1, void *input2, void *output,
uint32_t polynomial_size,
uint32_t total_polynomials);
void cuda_convert_lwe_programmable_bootstrap_key_32(
void *stream, uint32_t gpu_index, void *dest, void *src,
uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
uint32_t polynomial_size);
void cuda_convert_lwe_programmable_bootstrap_key_64(
void *stream, uint32_t gpu_index, void *dest, void *src,
uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
uint32_t polynomial_size);
void scratch_cuda_programmable_bootstrap_amortized_32(
void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory);
void scratch_cuda_programmable_bootstrap_amortized_64(
void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory);
void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void *lwe_output_indexes, void *lut_vector, void *lut_vector_indexes,
void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
uint32_t max_shared_memory);
void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void *lwe_output_indexes, void *lut_vector, void *lut_vector_indexes,
void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
uint32_t max_shared_memory, uint32_t gpu_offset = 0);
void cleanup_cuda_programmable_bootstrap_amortized(void *stream,
uint32_t gpu_index,
int8_t **pbs_buffer);
void scratch_cuda_programmable_bootstrap_32(
void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory);
void scratch_cuda_programmable_bootstrap_64(
void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory);
void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void *lwe_output_indexes, void *lut_vector, void *lut_vector_indexes,
void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
uint32_t max_shared_memory, uint32_t gpu_offset = 0);
void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void *lwe_output_indexes, void *lut_vector, void *lut_vector_indexes,
void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
uint32_t max_shared_memory, uint32_t gpu_offset = 0);
void cleanup_cuda_programmable_bootstrap(void *stream, uint32_t gpu_index,
int8_t **pbs_buffer);
uint64_t get_buffer_size_programmable_bootstrap_amortized_64(
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory);
uint64_t get_buffer_size_programmable_bootstrap_64(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory);
}
// Byte count for the "step one" working set: a rotated accumulator of
// polynomial_size Torus words plus the accumulator FFT buffer.
// The result is compared against the available shared memory by pbs_buffer.
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_programmable_bootstrap_step_one(
    uint32_t polynomial_size) {
  // accumulator_rotated
  const uint64_t rotated_accumulator_bytes = sizeof(Torus) * polynomial_size;
  // accumulator fft
  const uint64_t accumulator_fft_bytes = sizeof(double2) * polynomial_size / 2;
  return rotated_accumulator_bytes + accumulator_fft_bytes;
}
// Byte count for the "step two" working set: an accumulator of
// polynomial_size Torus words plus the accumulator FFT buffer.
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_programmable_bootstrap_step_two(
    uint32_t polynomial_size) {
  // accumulator
  const uint64_t accumulator_bytes = sizeof(Torus) * polynomial_size;
  // accumulator fft
  const uint64_t accumulator_fft_bytes = sizeof(double2) * polynomial_size / 2;
  return accumulator_bytes + accumulator_fft_bytes;
}
// Byte count of the accumulator FFT buffer alone. Torus does not enter the
// computation; it is kept as a template parameter for parity with the
// sibling size helpers.
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_partial_sm_programmable_bootstrap(uint32_t polynomial_size) {
  // accumulator fft
  const uint64_t accumulator_fft_bytes = sizeof(double2) * polynomial_size / 2;
  return accumulator_fft_bytes;
}
// Byte count for the TBC variant's full working set: rotated accumulator,
// accumulator (polynomial_size Torus words each) and the accumulator FFT
// buffer.
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_programmable_bootstrap_tbc(uint32_t polynomial_size) {
  // accumulator_rotated
  const uint64_t rotated_accumulator_bytes = sizeof(Torus) * polynomial_size;
  // accumulator
  const uint64_t accumulator_bytes = sizeof(Torus) * polynomial_size;
  // accumulator fft
  const uint64_t accumulator_fft_bytes = sizeof(double2) * polynomial_size / 2;
  return rotated_accumulator_bytes + accumulator_bytes + accumulator_fft_bytes;
}
// Byte count of the accumulator FFT buffer (mask & body) for the TBC variant.
// Torus is unused in the computation.
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_partial_sm_programmable_bootstrap_tbc(
    uint32_t polynomial_size) {
  // accumulator fft mask & body
  const uint64_t accumulator_fft_bytes = sizeof(double2) * polynomial_size / 2;
  return accumulator_fft_bytes;
}
// Byte count of the extra buffer the TBC variant reserves (tagged "tbc" in
// the original source); pbs_buffer adds it to the shared-memory thresholds
// when distributed shared memory is supported. Torus is unused.
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap(
    uint32_t polynomial_size) {
  // tbc
  const uint64_t tbc_bytes = sizeof(double2) * polynomial_size / 2;
  return tbc_bytes;
}
// Byte count for the CG variant's full working set: rotated accumulator,
// accumulator (polynomial_size Torus words each) and the accumulator FFT
// buffer.
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_programmable_bootstrap_cg(uint32_t polynomial_size) {
  // accumulator_rotated
  const uint64_t rotated_accumulator_bytes = sizeof(Torus) * polynomial_size;
  // accumulator
  const uint64_t accumulator_bytes = sizeof(Torus) * polynomial_size;
  // accumulator fft
  const uint64_t accumulator_fft_bytes = sizeof(double2) * polynomial_size / 2;
  return rotated_accumulator_bytes + accumulator_bytes + accumulator_fft_bytes;
}
// Byte count of the accumulator FFT buffer (mask & body) for the CG variant.
// Torus is unused in the computation.
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_partial_sm_programmable_bootstrap_cg(uint32_t polynomial_size) {
  // accumulator fft mask & body
  const uint64_t accumulator_fft_bytes = sizeof(double2) * polynomial_size / 2;
  return accumulator_fft_bytes;
}
template <typename Torus>
__host__ bool
supports_distributed_shared_memory_on_classic_programmable_bootstrap(
uint32_t polynomial_size, uint32_t max_shared_memory);
template <typename Torus, PBS_TYPE pbs_type> struct pbs_buffer;
// Scratch buffers for the classical programmable bootstrap.
//
// The constructor records the requested PBS_VARIANT and, when
// allocate_gpu_memory is true, sizes and allocates the buffers that variant
// needs on the given stream/GPU. release() frees them on a stream.
//
// Members:
//   d_mem                  - extra device memory used when the working set
//                            does not fit in shared memory (size may be 0).
//   global_accumulator     - only allocated (and only freed) for DEFAULT.
//   global_accumulator_fft - allocated for every supported variant.
//   pbs_variant            - variant chosen at construction time.
template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
  // Pointers start out null: when allocate_gpu_memory is false nothing is
  // allocated, and release() must not hand indeterminate pointers to
  // cuda_drop_async.
  int8_t *d_mem = nullptr;

  Torus *global_accumulator = nullptr;
  double2 *global_accumulator_fft = nullptr;

  PBS_VARIANT pbs_variant;

  // Sizes and (optionally) allocates the buffers for `pbs_variant`.
  //
  // stream / gpu_index          - forwarded to cuda_malloc_async.
  // glwe_dimension, polynomial_size, level_count,
  // input_lwe_ciphertext_count  - parameters the buffer sizes scale with.
  // pbs_variant                 - selects the sizing rules below; an
  //                               unsupported value reaches PANIC.
  // allocate_gpu_memory         - when false, no allocation is performed.
  pbs_buffer(cudaStream_t stream, uint32_t gpu_index, uint32_t glwe_dimension,
             uint32_t polynomial_size, uint32_t level_count,
             uint32_t input_lwe_ciphertext_count, PBS_VARIANT pbs_variant,
             bool allocate_gpu_memory) {
    this->pbs_variant = pbs_variant;

    auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);

    if (allocate_gpu_memory) {
      // Widen before multiplying: the size products below are otherwise
      // evaluated in 32-bit arithmetic and can wrap for large batches.
      const uint64_t glwe_poly_count =
          static_cast<uint64_t>(glwe_dimension) + 1;

      switch (pbs_variant) {
      case PBS_VARIANT::DEFAULT: {
        uint64_t full_sm_step_one =
            get_buffer_size_full_sm_programmable_bootstrap_step_one<Torus>(
                polynomial_size);
        uint64_t full_sm_step_two =
            get_buffer_size_full_sm_programmable_bootstrap_step_two<Torus>(
                polynomial_size);
        uint64_t partial_sm =
            get_buffer_size_partial_sm_programmable_bootstrap<Torus>(
                polynomial_size);

        uint64_t partial_dm_step_one = full_sm_step_one - partial_sm;
        uint64_t partial_dm_step_two = full_sm_step_two - partial_sm;
        uint64_t full_dm = full_sm_step_one;

        // Device memory needed depends on how much of the working set fits
        // in shared memory.
        uint64_t device_mem = 0;
        if (max_shared_memory < partial_sm) {
          device_mem = full_dm * input_lwe_ciphertext_count * level_count *
                       (glwe_dimension + 1);
        } else if (max_shared_memory < full_sm_step_two) {
          device_mem =
              (partial_dm_step_two + partial_dm_step_one * level_count) *
              input_lwe_ciphertext_count * (glwe_dimension + 1);
        } else if (max_shared_memory < full_sm_step_one) {
          device_mem = partial_dm_step_one * input_lwe_ciphertext_count *
                       level_count * (glwe_dimension + 1);
        }
        // Otherwise, both kernels run all in shared memory
        d_mem = (int8_t *)cuda_malloc_async(device_mem, stream, gpu_index);

        global_accumulator_fft = (double2 *)cuda_malloc_async(
            glwe_poly_count * level_count * input_lwe_ciphertext_count *
                (polynomial_size / 2) * sizeof(double2),
            stream, gpu_index);

        global_accumulator = (Torus *)cuda_malloc_async(
            glwe_poly_count * input_lwe_ciphertext_count * polynomial_size *
                sizeof(Torus),
            stream, gpu_index);
      } break;
      case PBS_VARIANT::CG: {
        uint64_t full_sm =
            get_buffer_size_full_sm_programmable_bootstrap_cg<Torus>(
                polynomial_size);
        uint64_t partial_sm =
            get_buffer_size_partial_sm_programmable_bootstrap_cg<Torus>(
                polynomial_size);

        uint64_t partial_dm = full_sm - partial_sm;
        uint64_t full_dm = full_sm;
        uint64_t device_mem = 0;

        if (max_shared_memory < partial_sm) {
          device_mem = full_dm * input_lwe_ciphertext_count * level_count *
                       (glwe_dimension + 1);
        } else if (max_shared_memory < full_sm) {
          device_mem = partial_dm * input_lwe_ciphertext_count * level_count *
                       (glwe_dimension + 1);
        }

        // Otherwise, both kernels run all in shared memory
        d_mem = (int8_t *)cuda_malloc_async(device_mem, stream, gpu_index);

        global_accumulator_fft = (double2 *)cuda_malloc_async(
            glwe_poly_count * level_count * input_lwe_ciphertext_count *
                polynomial_size / 2 * sizeof(double2),
            stream, gpu_index);
      } break;
#if CUDA_ARCH >= 900
      case PBS_VARIANT::TBC: {

        bool supports_dsm =
            supports_distributed_shared_memory_on_classic_programmable_bootstrap<
                Torus>(polynomial_size, max_shared_memory);

        uint64_t full_sm =
            get_buffer_size_full_sm_programmable_bootstrap_tbc<Torus>(
                polynomial_size);
        uint64_t partial_sm =
            get_buffer_size_partial_sm_programmable_bootstrap_tbc<Torus>(
                polynomial_size);
        uint64_t minimum_sm_tbc = 0;
        if (supports_dsm)
          minimum_sm_tbc =
              get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap<
                  Torus>(polynomial_size);

        uint64_t partial_dm = full_sm - partial_sm;
        uint64_t full_dm = full_sm;
        uint64_t device_mem = 0;

        // There is a minimum amount of memory we need to run the TBC PBS, which
        // is minimum_sm_tbc. We know that minimum_sm_tbc bytes are available
        // because otherwise the previous check would have redirected
        // computation to some other variant. If over that we don't have more
        // partial_sm bytes, TBC PBS will run on NOSM. If we have partial_sm but
        // not full_sm bytes, it will run on PARTIALSM. Otherwise, FULLSM.
        //
        // NOSM mode actually requires minimum_sm_tbc shared memory bytes.
        if (max_shared_memory < partial_sm + minimum_sm_tbc) {
          device_mem = full_dm * input_lwe_ciphertext_count * level_count *
                       (glwe_dimension + 1);
        } else if (max_shared_memory < full_sm + minimum_sm_tbc) {
          device_mem = partial_dm * input_lwe_ciphertext_count * level_count *
                       (glwe_dimension + 1);
        }

        // Otherwise, both kernels run all in shared memory
        d_mem = (int8_t *)cuda_malloc_async(device_mem, stream, gpu_index);

        global_accumulator_fft = (double2 *)cuda_malloc_async(
            glwe_poly_count * level_count * input_lwe_ciphertext_count *
                polynomial_size / 2 * sizeof(double2),
            stream, gpu_index);
      } break;
#endif
      default:
        PANIC("Cuda error (PBS): unsupported implementation variant.")
      }
    }
  }

  // Frees the buffers on `stream`. global_accumulator is only dropped for the
  // DEFAULT variant, the only one that allocates it.
  void release(cudaStream_t stream, uint32_t gpu_index) {
    cuda_drop_async(d_mem, stream, gpu_index);
    cuda_drop_async(global_accumulator_fft, stream, gpu_index);

    if (pbs_variant == DEFAULT)
      cuda_drop_async(global_accumulator, stream, gpu_index);
  }
};
// Returns the number of bytes of scratch memory computed for the CG variant:
// the device-memory spill (which depends on how max_shared_memory compares
// with the full / partial shared-memory requirements) plus the global
// accumulator FFT buffer.
//
// glwe_dimension, polynomial_size, level_count,
// input_lwe_ciphertext_count - parameters the size scales with.
// max_shared_memory          - shared-memory budget used to pick the spill
//                              size.
template <typename Torus>
__host__ __device__ uint64_t get_buffer_size_programmable_bootstrap_cg(
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {

  uint64_t full_sm =
      get_buffer_size_full_sm_programmable_bootstrap_cg<Torus>(polynomial_size);
  uint64_t partial_sm =
      get_buffer_size_partial_sm_programmable_bootstrap_cg<Torus>(
          polynomial_size);
  uint64_t partial_dm = full_sm - partial_sm;
  uint64_t full_dm = full_sm;
  uint64_t device_mem = 0;
  if (max_shared_memory < partial_sm) {
    device_mem = full_dm * input_lwe_ciphertext_count * level_count *
                 (glwe_dimension + 1);
  } else if (max_shared_memory < full_sm) {
    device_mem = partial_dm * input_lwe_ciphertext_count * level_count *
                 (glwe_dimension + 1);
  }
  // Widen before multiplying: with all-uint32_t operands the product below
  // would be evaluated in 32-bit arithmetic and could wrap for large batches.
  const uint64_t glwe_poly_count = static_cast<uint64_t>(glwe_dimension) + 1;
  uint64_t buffer_size = device_mem + glwe_poly_count * level_count *
                                          input_lwe_ciphertext_count *
                                          polynomial_size / 2 * sizeof(double2);
  // NOTE(review): this adds the remainder rather than rounding up to the next
  // multiple of sizeof(double2); kept as-is, confirm the intended padding.
  return buffer_size + buffer_size % sizeof(double2);
}
template <typename Torus>
bool has_support_to_cuda_programmable_bootstrap_cg(uint32_t glwe_dimension,
uint32_t polynomial_size,
uint32_t level_count,
uint32_t num_samples,
uint32_t max_shared_memory);
template <typename Torus>
void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
void *stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t gpu_offset = 0);
template <typename Torus>
void cuda_programmable_bootstrap_lwe_ciphertext_vector(
void *stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t gpu_offset = 0);
#if (CUDA_ARCH >= 900)
template <typename Torus>
void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
void *stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t gpu_offset = 0);
template <typename Torus, typename STorus>
void scratch_cuda_programmable_bootstrap_tbc(
void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **pbs_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory);
#endif
template <typename Torus, typename STorus>
void scratch_cuda_programmable_bootstrap_cg(
void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **pbs_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory);
template <typename Torus, typename STorus>
void scratch_cuda_programmable_bootstrap(
void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory);
template <typename Torus>
bool has_support_to_cuda_programmable_bootstrap_tbc(uint32_t num_samples,
uint32_t glwe_dimension,
uint32_t polynomial_size,
uint32_t level_count,
uint32_t max_shared_memory);
#ifdef __CUDACC__
__device__ inline int get_start_ith_ggsw(int i, uint32_t polynomial_size,
int glwe_dimension,
uint32_t level_count);
template <typename T>
__device__ T *get_ith_mask_kth_block(T *ptr, int i, int k, int level,
uint32_t polynomial_size,
int glwe_dimension, uint32_t level_count);
template <typename T>
__device__ T *get_ith_body_kth_block(T *ptr, int i, int k, int level,
uint32_t polynomial_size,
int glwe_dimension, uint32_t level_count);
template <typename T>
__device__ T *get_multi_bit_ith_lwe_gth_group_kth_block(
T *ptr, int g, int i, int k, int level, uint32_t grouping_factor,
uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t level_count);
#endif
#endif // CUDA_BOOTSTRAP_H

View File

@@ -1,324 +0,0 @@
#ifndef CUDA_MULTI_BIT_H
#define CUDA_MULTI_BIT_H
#include "programmable_bootstrap.h"
#include <cstdint>
// C-linkage entry points of the multi-bit programmable bootstrap (PBS).
// Only declarations are given here; the definitions are not part of this
// header. Pointers are passed as void * / int8_t * so the functions can be
// called without knowledge of the templated C++ types.
// NOTE(review): the default arguments (chunk_size / lwe_chunk_size = 0) are
// only usable by C++ callers that include this header; foreign callers must
// pass every argument explicitly.
extern "C" {
// Reports whether the cooperative-groups (CG) multi-bit PBS can be used for
// the given parameters and shared-memory budget.
// NOTE(review): exact support criteria live in the definition - confirm there.
bool has_support_to_cuda_programmable_bootstrap_cg_multi_bit(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t num_samples, uint32_t max_shared_memory);
// Converts a 64-bit multi-bit bootstrapping key from `src` into `dest` on the
// given stream / GPU.
// NOTE(review): presumably `src` is host memory and `dest` device memory -
// verify against the definition.
void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
void *stream, uint32_t gpu_index, void *dest, void *src,
uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
uint32_t polynomial_size, uint32_t grouping_factor);
// Creates the scratch buffer for the 64-bit multi-bit PBS and returns it
// through `pbs_buffer`.
void scratch_cuda_multi_bit_programmable_bootstrap_64(
void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t grouping_factor,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory, uint32_t chunk_size = 0);
// Runs the 64-bit multi-bit PBS on a vector of LWE ciphertexts, using a
// `buffer` obtained from the scratch function above.
void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void *lwe_output_indexes, void *lut_vector, void *lut_vector_indexes,
void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t gpu_offset,
uint32_t lwe_chunk_size = 0);
// Releases a scratch buffer previously created for the multi-bit PBS.
void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream,
uint32_t gpu_index,
int8_t **pbs_buffer);
}
template <typename Torus>
__host__ bool
supports_distributed_shared_memory_on_multibit_programmable_bootstrap(
uint32_t polynomial_size, uint32_t max_shared_memory);
template <typename Torus>
bool has_support_to_cuda_programmable_bootstrap_tbc_multi_bit(
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t max_shared_memory);
#if CUDA_ARCH >= 900
template <typename Torus, typename STorus>
void scratch_cuda_tbc_multi_bit_programmable_bootstrap(
void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t grouping_factor,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory, uint32_t lwe_chunk_size);
template <typename Torus>
void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
void *stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
uint32_t gpu_offset, uint32_t lwe_chunk_size);
#endif
template <typename Torus, typename STorus>
void scratch_cuda_cg_multi_bit_programmable_bootstrap(
void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t grouping_factor,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0);
template <typename Torus, typename STorus>
void scratch_cuda_cg_multi_bit_programmable_bootstrap(
void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **pbs_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0);
template <typename Torus>
void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
void *stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
uint32_t gpu_offset, uint32_t lwe_chunk_size = 0);
template <typename Torus, typename STorus>
void scratch_cuda_multi_bit_programmable_bootstrap(
void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t grouping_factor,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0);
template <typename Torus>
void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
void *stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
uint32_t gpu_offset, uint32_t lwe_chunk_size = 0);
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle(
uint32_t polynomial_size);
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_multibit_programmable_bootstrap_step_one(
uint32_t polynomial_size);
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two(
uint32_t polynomial_size);
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_partial_sm_multibit_programmable_bootstrap_step_one(
uint32_t polynomial_size);
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_cg_multibit_programmable_bootstrap(
uint32_t polynomial_size);
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_partial_sm_cg_multibit_programmable_bootstrap(
uint32_t polynomial_size);
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap(
uint32_t polynomial_size);
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_partial_sm_tbc_multibit_programmable_bootstrap(
uint32_t polynomial_size);
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap(
uint32_t polynomial_size);
/*
 * Device buffers used by the multi-bit programmable bootstrap.
 *
 * The constructor computes, for the selected PBS variant, how much per-block
 * scratch has to be placed in global memory because it does not fit in the
 * shared memory reported by cuda_get_max_shared_memory(), and allocates it
 * together with the key-bundle and accumulator buffers. Nothing is allocated
 * when `allocate_gpu_memory` is false.
 *
 * Every pointer starts out as NULL and release() only drops pointers that were
 * actually allocated, so release() is safe to call on an object built with
 * `allocate_gpu_memory == false`.
 */
template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
  // Per-block scratch in global memory; stays NULL when the corresponding
  // kernel can run entirely from shared memory.
  int8_t *d_mem_keybundle = NULL;
  int8_t *d_mem_acc_step_one = NULL;
  int8_t *d_mem_acc_step_two = NULL;
  int8_t *d_mem_acc_cg = NULL;
  int8_t *d_mem_acc_tbc = NULL;

  // Buffers allocated for every variant (when allocation is requested).
  double2 *keybundle_fft = NULL;
  Torus *global_accumulator = NULL;
  double2 *global_accumulator_fft = NULL;

  PBS_VARIANT pbs_variant;

  /*
   * stream / gpu_index          : where the asynchronous allocations happen
   * glwe_dimension, polynomial_size, level_count,
   * input_lwe_ciphertext_count,
   * lwe_chunk_size              : parameters the buffer sizes are derived from
   * pbs_variant                 : DEFAULT, CG or (CUDA_ARCH >= 900) TBC;
   *                               any other value PANICs
   * allocate_gpu_memory         : when false only pbs_variant is recorded
   */
  pbs_buffer(cudaStream_t stream, uint32_t gpu_index, uint32_t glwe_dimension,
             uint32_t polynomial_size, uint32_t level_count,
             uint32_t input_lwe_ciphertext_count, uint32_t lwe_chunk_size,
             PBS_VARIANT pbs_variant, bool allocate_gpu_memory) {
    this->pbs_variant = pbs_variant;
    auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);

    // default
    uint64_t full_sm_keybundle =
        get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle<
            Torus>(polynomial_size);
    uint64_t full_sm_accumulate_step_one =
        get_buffer_size_full_sm_multibit_programmable_bootstrap_step_one<Torus>(
            polynomial_size);
    uint64_t full_sm_accumulate_step_two =
        get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two<Torus>(
            polynomial_size);
    uint64_t partial_sm_accumulate_step_one =
        get_buffer_size_partial_sm_multibit_programmable_bootstrap_step_one<
            Torus>(polynomial_size);

    // cg
    uint64_t full_sm_cg_accumulate =
        get_buffer_size_full_sm_cg_multibit_programmable_bootstrap<Torus>(
            polynomial_size);
    uint64_t partial_sm_cg_accumulate =
        get_buffer_size_partial_sm_cg_multibit_programmable_bootstrap<Torus>(
            polynomial_size);

    // Block counts are widened to 64 bits before multiplying: all factors are
    // 32-bit parameters and their product (and the byte sizes derived from
    // it) may not fit in 32 bits for large batches.
    uint64_t num_blocks_keybundle =
        (uint64_t)input_lwe_ciphertext_count * lwe_chunk_size *
        (glwe_dimension + 1) * (glwe_dimension + 1) * level_count;
    uint64_t num_blocks_acc_step_one =
        (uint64_t)level_count * (glwe_dimension + 1) *
        input_lwe_ciphertext_count;
    uint64_t num_blocks_acc_step_two =
        (uint64_t)input_lwe_ciphertext_count * (glwe_dimension + 1);
    uint64_t num_blocks_acc_cg = (uint64_t)level_count * (glwe_dimension + 1) *
                                 input_lwe_ciphertext_count;

#if CUDA_ARCH >= 900
    uint64_t full_sm_tbc_accumulate =
        get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap<Torus>(
            polynomial_size);
    uint64_t partial_sm_tbc_accumulate =
        get_buffer_size_partial_sm_tbc_multibit_programmable_bootstrap<Torus>(
            polynomial_size);
    uint64_t minimum_sm_tbc =
        get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap<Torus>(
            polynomial_size);
    uint64_t num_blocks_acc_tbc = num_blocks_acc_cg;
#endif

    if (allocate_gpu_memory) {
      // Keybundle
      if (max_shared_memory < full_sm_keybundle)
        d_mem_keybundle = (int8_t *)cuda_malloc_async(
            num_blocks_keybundle * full_sm_keybundle, stream, gpu_index);

      switch (pbs_variant) {
      case PBS_VARIANT::CG:
        // Accumulator CG
        if (max_shared_memory < partial_sm_cg_accumulate)
          d_mem_acc_cg = (int8_t *)cuda_malloc_async(
              num_blocks_acc_cg * full_sm_cg_accumulate, stream, gpu_index);
        else if (max_shared_memory < full_sm_cg_accumulate)
          d_mem_acc_cg = (int8_t *)cuda_malloc_async(
              num_blocks_acc_cg * partial_sm_cg_accumulate, stream, gpu_index);
        break;
      case PBS_VARIANT::DEFAULT:
        // Accumulator step one
        if (max_shared_memory < partial_sm_accumulate_step_one)
          d_mem_acc_step_one = (int8_t *)cuda_malloc_async(
              num_blocks_acc_step_one * full_sm_accumulate_step_one, stream,
              gpu_index);
        else if (max_shared_memory < full_sm_accumulate_step_one)
          d_mem_acc_step_one = (int8_t *)cuda_malloc_async(
              num_blocks_acc_step_one * partial_sm_accumulate_step_one, stream,
              gpu_index);

        // Accumulator step two
        if (max_shared_memory < full_sm_accumulate_step_two)
          d_mem_acc_step_two = (int8_t *)cuda_malloc_async(
              num_blocks_acc_step_two * full_sm_accumulate_step_two, stream,
              gpu_index);
        break;
#if CUDA_ARCH >= 900
      case TBC:
        // The TBC PBS needs at least minimum_sm_tbc bytes of shared memory.
        // That amount is known to be available here, because otherwise the
        // earlier support check would have redirected the computation to
        // another variant. On top of it:
        //  - without partial_sm_tbc_accumulate extra bytes, TBC runs in NOSM;
        //  - with partial_sm_tbc_accumulate but not full_sm_tbc_accumulate
        //    extra bytes, it runs in PARTIALSM;
        //  - otherwise it runs in FULLSM.
        //
        // NOSM mode still requires minimum_sm_tbc shared memory bytes.

        // Accumulator TBC
        if (max_shared_memory < partial_sm_tbc_accumulate + minimum_sm_tbc)
          d_mem_acc_tbc = (int8_t *)cuda_malloc_async(
              num_blocks_acc_tbc * full_sm_tbc_accumulate, stream, gpu_index);
        else if (max_shared_memory < full_sm_tbc_accumulate + minimum_sm_tbc)
          d_mem_acc_tbc = (int8_t *)cuda_malloc_async(
              num_blocks_acc_tbc * partial_sm_tbc_accumulate, stream,
              gpu_index);
        break;
#endif
      default:
        PANIC("Cuda error (PBS): unsupported implementation variant.")
      }

      keybundle_fft = (double2 *)cuda_malloc_async(
          num_blocks_keybundle * (polynomial_size / 2) * sizeof(double2),
          stream, gpu_index);
      global_accumulator = (Torus *)cuda_malloc_async(
          num_blocks_acc_step_one * polynomial_size * sizeof(Torus), stream,
          gpu_index);
      global_accumulator_fft = (double2 *)cuda_malloc_async(
          num_blocks_acc_step_one * (polynomial_size / 2) * sizeof(double2),
          stream, gpu_index);
    }
  }

  // Drops every buffer that was allocated by the constructor. Pointers that
  // are still NULL (never allocated) are skipped. PANICs on a variant that is
  // not supported by this build, like the constructor does.
  void release(cudaStream_t stream, uint32_t gpu_index) {

    if (d_mem_keybundle)
      cuda_drop_async(d_mem_keybundle, stream, gpu_index);

    switch (pbs_variant) {
    case DEFAULT:
      if (d_mem_acc_step_one)
        cuda_drop_async(d_mem_acc_step_one, stream, gpu_index);
      if (d_mem_acc_step_two)
        cuda_drop_async(d_mem_acc_step_two, stream, gpu_index);
      break;
    case CG:
      if (d_mem_acc_cg)
        cuda_drop_async(d_mem_acc_cg, stream, gpu_index);
      break;
#if CUDA_ARCH >= 900
    case TBC:
      if (d_mem_acc_tbc)
        cuda_drop_async(d_mem_acc_tbc, stream, gpu_index);
      break;
#endif
    default:
      PANIC("Cuda error (PBS): unsupported implementation variant.")
    }

    // These three are only set when allocate_gpu_memory was true.
    if (keybundle_fft)
      cuda_drop_async(keybundle_fft, stream, gpu_index);
    if (global_accumulator)
      cuda_drop_async(global_accumulator, stream, gpu_index);
    if (global_accumulator_fft)
      cuda_drop_async(global_accumulator_fft, stream, gpu_index);
  }
};
template <typename Torus, class params>
__host__ uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
uint32_t polynomial_size,
uint32_t max_shared_memory);
#endif // CUDA_MULTI_BIT_H

View File

@@ -10,8 +10,7 @@ set(SOURCES
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/keyswitch.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/linear_algebra.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/shifts.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/vertical_packing.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/helper_multi_gpu.h)
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/vertical_packing.h)
file(GLOB_RECURSE SOURCES "*.cu")
add_library(tfhe_cuda_backend STATIC ${SOURCES})
set_target_properties(tfhe_cuda_backend PROPERTIES CUDA_SEPARABLE_COMPILATION ON CUDA_RESOLVE_DEVICE_SYMBOLS ON)

View File

@@ -1,21 +1 @@
#include "ciphertext.cuh"
// Type-erased 64-bit entry point: reinterprets the opaque stream and buffer
// pointers and forwards to the uint64_t instantiation of
// cuda_convert_lwe_ciphertext_vector_to_gpu.
void cuda_convert_lwe_ciphertext_vector_to_gpu_64(void *stream,
                                                  uint32_t gpu_index,
                                                  void *dest, void *src,
                                                  uint32_t number_of_cts,
                                                  uint32_t lwe_dimension) {
  auto typed_stream = static_cast<cudaStream_t>(stream);
  auto *typed_dest = static_cast<uint64_t *>(dest);
  auto *typed_src = static_cast<uint64_t *>(src);

  cuda_convert_lwe_ciphertext_vector_to_gpu<uint64_t>(
      typed_stream, gpu_index, typed_dest, typed_src, number_of_cts,
      lwe_dimension);
}
// Type-erased 64-bit entry point: reinterprets the opaque stream and buffer
// pointers and forwards to the uint64_t instantiation of
// cuda_convert_lwe_ciphertext_vector_to_cpu.
void cuda_convert_lwe_ciphertext_vector_to_cpu_64(void *stream,
                                                  uint32_t gpu_index,
                                                  void *dest, void *src,
                                                  uint32_t number_of_cts,
                                                  uint32_t lwe_dimension) {
  auto typed_stream = static_cast<cudaStream_t>(stream);
  auto *typed_dest = static_cast<uint64_t *>(dest);
  auto *typed_src = static_cast<uint64_t *>(src);

  cuda_convert_lwe_ciphertext_vector_to_cpu<uint64_t>(
      typed_stream, gpu_index, typed_dest, typed_src, number_of_cts,
      lwe_dimension);
}

View File

@@ -6,23 +6,39 @@
#include <cstdint>
template <typename T>
void cuda_convert_lwe_ciphertext_vector_to_gpu(cudaStream_t stream,
uint32_t gpu_index, T *dest,
T *src, uint32_t number_of_cts,
void cuda_convert_lwe_ciphertext_vector_to_gpu(T *dest, T *src,
cuda_stream_t *stream,
uint32_t number_of_cts,
uint32_t lwe_dimension) {
cudaSetDevice(gpu_index);
cudaSetDevice(stream->gpu_index);
uint64_t size = number_of_cts * (lwe_dimension + 1) * sizeof(T);
cuda_memcpy_async_to_gpu(dest, src, size, stream, gpu_index);
cuda_memcpy_async_to_gpu(dest, src, size, stream);
}
// 64-bit wrapper: views `dest` and `src` as uint64_t arrays and forwards to
// the uint64_t instantiation of cuda_convert_lwe_ciphertext_vector_to_gpu.
void cuda_convert_lwe_ciphertext_vector_to_gpu_64(void *dest, void *src,
                                                  cuda_stream_t *stream,
                                                  uint32_t number_of_cts,
                                                  uint32_t lwe_dimension) {
  auto *typed_dest = static_cast<uint64_t *>(dest);
  auto *typed_src = static_cast<uint64_t *>(src);

  cuda_convert_lwe_ciphertext_vector_to_gpu<uint64_t>(
      typed_dest, typed_src, stream, number_of_cts, lwe_dimension);
}
template <typename T>
void cuda_convert_lwe_ciphertext_vector_to_cpu(cudaStream_t stream,
uint32_t gpu_index, T *dest,
T *src, uint32_t number_of_cts,
void cuda_convert_lwe_ciphertext_vector_to_cpu(T *dest, T *src,
cuda_stream_t *stream,
uint32_t number_of_cts,
uint32_t lwe_dimension) {
cudaSetDevice(gpu_index);
cudaSetDevice(stream->gpu_index);
uint64_t size = number_of_cts * (lwe_dimension + 1) * sizeof(T);
cuda_memcpy_async_to_cpu(dest, src, size, stream, gpu_index);
cuda_memcpy_async_to_cpu(dest, src, size, stream);
}
// 64-bit wrapper: views `dest` and `src` as uint64_t arrays and forwards to
// the uint64_t instantiation of cuda_convert_lwe_ciphertext_vector_to_cpu.
void cuda_convert_lwe_ciphertext_vector_to_cpu_64(void *dest, void *src,
                                                  cuda_stream_t *stream,
                                                  uint32_t number_of_cts,
                                                  uint32_t lwe_dimension) {
  auto *typed_dest = static_cast<uint64_t *>(dest);
  auto *typed_src = static_cast<uint64_t *>(src);

  cuda_convert_lwe_ciphertext_vector_to_cpu<uint64_t>(
      typed_dest, typed_src, stream, number_of_cts, lwe_dimension);
}
#endif

View File

@@ -20,7 +20,9 @@ private:
uint32_t level_count;
uint32_t base_log;
uint32_t mask;
uint32_t halfbg;
uint32_t num_poly;
T offset;
int current_level;
T mask_mod_b;
T *state;
@@ -80,12 +82,72 @@ public:
synchronize_threads_in_block();
}
// Decomposes a single polynomial: extracts the next digit of the j-th
// polynomial held in `state` and writes it to `result` as doubles.
//
// result : per-thread output; result[i].x / result[i].y receive the digits of
//          the two coefficients handled in iteration i.
// j      : index of the polynomial inside `state` (stride params::degree).
//
// `state` is updated in place (shifted and carry-adjusted), so each call
// consumes one digit. current_level is decremented only when j == 0.
// NOTE(review): presumably called once per polynomial for each level, with
// j == 0 marking the start of a new level - confirm against the callers.
__device__ void
decompose_and_compress_next_polynomial_elements(double2 *result, int j) {
if (j == 0)
current_level -= 1;
int tid = threadIdx.x;
// Start of the j-th polynomial in the state buffer.
auto state_slice = state + j * params::degree;
// Each iteration handles two coefficients, params::degree / 2 apart.
for (int i = 0; i < params::opt / 2; i++) {
// Bits of the state selected by mask_mod_b form the raw digit.
T res_re = state_slice[tid] & mask_mod_b;
T res_im = state_slice[tid + params::degree / 2] & mask_mod_b;
// Consume those bits from the state.
state_slice[tid] >>= base_log;
state_slice[tid + params::degree / 2] >>= base_log;
// Carry computation; after the shift below only the bit at position
// base_log - 1 and above remains.
// NOTE(review): assuming mask_mod_b == 2^base_log - 1, the carry is 0 or 1
// and turns the digit into a signed (balanced) representative - confirm.
T carry_re = ((res_re - 1ll) | state_slice[tid]) & res_re;
T carry_im =
((res_im - 1ll) | state_slice[tid + params::degree / 2]) & res_im;
carry_re >>= (base_log - 1);
carry_im >>= (base_log - 1);
// Propagate the carry into the remaining state ...
state_slice[tid] += carry_re;
state_slice[tid + params::degree / 2] += carry_im;
// ... and subtract carry * 2^base_log from the digit to compensate.
res_re -= carry_re << base_log;
res_im -= carry_im << base_log;
// Reinterpret as signed 32-bit before the conversion to double.
result[i].x = (int32_t)res_re;
result[i].y = (int32_t)res_im;
// Move on to this thread's next pair of coefficients.
tid += params::degree / params::opt;
}
synchronize_threads_in_block();
}
// Calls decompose_and_compress_next (level_count - level) times on `result`;
// each call overwrites `result`, so the output of the last call is what
// remains.
__device__ void decompose_and_compress_level(double2 *result, int level) {
  int done = 0;
  while (done < level_count - level) {
    decompose_and_compress_next(result);
    ++done;
  }
}
};
// Gadget decomposition of a single torus element, one level at a time.
//
// With B = 2^base_log, the constructor precomputes
//   halfbg = B / 2, mask = B - 1 and
//   offset = halfbg * sum_{l = 1..level_count} 2^(bits(T) - l * base_log).
// decompose_one_level_single() adds that offset to the element, extracts the
// base_log-bit digit of the requested level and subtracts halfbg from it.
template <typename T> class GadgetMatrixSingle {
public:
  __device__ GadgetMatrixSingle(uint32_t base_log, uint32_t level_count)
      : level_count(level_count), base_log(base_log) {
    const uint32_t bg = 1 << base_log;
    this->halfbg = bg / 2;
    this->mask = bg - 1;

    // Sum of one bit per level, placed at the bottom of that level's digit
    // position counted from the most significant end of T.
    T level_bits = 0;
    for (uint32_t lvl = 1; lvl <= this->level_count; ++lvl) {
      level_bits += 1ULL << (sizeof(T) * 8 - lvl * this->base_log);
    }
    this->offset = level_bits * this->halfbg;
  }

  // Returns the digit of `element` for `level` (0 = most significant digit),
  // shifted by -halfbg and converted back to T.
  __device__ T decompose_one_level_single(T element, uint32_t level) {
    const uint32_t shift = (sizeof(T) * 8 - (level + 1) * this->base_log);
    const T shifted_element = element + this->offset;
    T digit = (shifted_element >> shift) & this->mask;
    return (T)(digit - this->halfbg);
  }

private:
  uint32_t level_count;
  uint32_t base_log;
  uint32_t mask;
  uint32_t halfbg;
  T offset;
};
template <typename Torus>
__device__ Torus decompose_one(Torus &state, Torus mask_mod_b, int base_log) {
Torus res = state & mask_mod_b;

View File

@@ -49,15 +49,11 @@ __global__ void device_batch_fft_ggsw_vector(double2 *dest, T *src,
* global memory
*/
template <typename T, typename ST, class params>
void batch_fft_ggsw_vector(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, double2 *dest, T *src,
void batch_fft_ggsw_vector(cuda_stream_t *stream, double2 *dest, T *src,
int8_t *d_mem, uint32_t r, uint32_t glwe_dim,
uint32_t polynomial_size, uint32_t level_count,
uint32_t max_shared_memory) {
if (gpu_count != 1)
PANIC("GPU error (batch_fft_ggsw_vector): multi-GPU execution is not "
"supported yet.")
cudaSetDevice(gpu_indexes[0]);
uint32_t gpu_index, uint32_t max_shared_memory) {
cudaSetDevice(stream->gpu_index);
int shared_memory_size = sizeof(double) * polynomial_size;
@@ -66,11 +62,11 @@ void batch_fft_ggsw_vector(cudaStream_t *streams, uint32_t *gpu_indexes,
if (max_shared_memory < shared_memory_size) {
device_batch_fft_ggsw_vector<T, ST, params, NOSM>
<<<gridSize, blockSize, 0, streams[0]>>>(dest, src, d_mem);
<<<gridSize, blockSize, 0, stream->stream>>>(dest, src, d_mem);
} else {
device_batch_fft_ggsw_vector<T, ST, params, FULLSM>
<<<gridSize, blockSize, shared_memory_size, streams[0]>>>(dest, src,
d_mem);
<<<gridSize, blockSize, shared_memory_size, stream->stream>>>(dest, src,
d_mem);
}
check_cuda_error(cudaGetLastError());
}

View File

@@ -6,19 +6,16 @@
* Head out to the equivalent operation on 64 bits for more details.
*/
void cuda_keyswitch_lwe_ciphertext_vector_32(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t gpu_offset) {
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
void *lwe_array_in, void *lwe_input_indexes, void *ksk,
uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, uint32_t base_log,
uint32_t level_count, uint32_t num_samples) {
cuda_keyswitch_lwe_ciphertext_vector(
static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint32_t *>(lwe_array_out),
stream, static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_output_indexes),
static_cast<uint32_t *>(lwe_array_in),
static_cast<uint32_t *>(lwe_input_indexes), static_cast<uint32_t *>(ksk),
lwe_dimension_in, lwe_dimension_out, base_log, level_count, num_samples,
gpu_offset);
lwe_dimension_in, lwe_dimension_out, base_log, level_count, num_samples);
}
/* Perform keyswitch on a batch of 64 bits input LWE ciphertexts.
@@ -38,17 +35,14 @@ void cuda_keyswitch_lwe_ciphertext_vector_32(
* - num_samples blocks of threads are launched
*/
void cuda_keyswitch_lwe_ciphertext_vector_64(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t gpu_offset) {
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
void *lwe_array_in, void *lwe_input_indexes, void *ksk,
uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, uint32_t base_log,
uint32_t level_count, uint32_t num_samples) {
cuda_keyswitch_lwe_ciphertext_vector(
static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint64_t *>(lwe_array_out),
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes), static_cast<uint64_t *>(ksk),
lwe_dimension_in, lwe_dimension_out, base_log, level_count, num_samples,
gpu_offset);
lwe_dimension_in, lwe_dimension_out, base_log, level_count, num_samples);
}

View File

@@ -3,11 +3,8 @@
#include "device.h"
#include "gadget.cuh"
#include "helper_multi_gpu.h"
#include "polynomial/functions.cuh"
#include "polynomial/polynomial_math.cuh"
#include "torus.cuh"
#include "utils/kernel_dimensions.cuh"
#include <thread>
#include <vector>
@@ -34,128 +31,114 @@ __device__ Torus *get_ith_block(Torus *ksk, int i, int level,
* scaling factor) under key s2 instead of s1, with an increased noise
*
*/
// Each thread in x is used to calculate one output.
// Threads in y are used to parallelize the lwe_dimension_in loop.
// Shared memory is used to store intermediate results of the reduction.
template <typename Torus>
__global__ void keyswitch(Torus *lwe_array_out, Torus *lwe_output_indexes,
Torus *lwe_array_in, Torus *lwe_input_indexes,
Torus *ksk, uint32_t lwe_dimension_in,
uint32_t lwe_dimension_out, uint32_t base_log,
uint32_t level_count, int gpu_offset) {
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
const int shmem_index = threadIdx.x + threadIdx.y * blockDim.x;
__global__ void
keyswitch(Torus *lwe_array_out, Torus *lwe_output_indexes, Torus *lwe_array_in,
Torus *lwe_input_indexes, Torus *ksk, uint32_t lwe_dimension_in,
uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count,
int lwe_lower, int lwe_upper, int cutoff) {
int tid = threadIdx.x;
extern __shared__ int8_t sharedmem[];
Torus *lwe_acc_out = (Torus *)sharedmem;
auto block_lwe_array_out =
get_chunk(lwe_array_out, lwe_output_indexes[blockIdx.y + gpu_offset],
lwe_dimension_out + 1);
if (tid <= lwe_dimension_out) {
Torus *local_lwe_array_out = (Torus *)sharedmem;
Torus local_lwe_out = 0;
auto block_lwe_array_in =
get_chunk(lwe_array_in, lwe_input_indexes[blockIdx.y + gpu_offset],
lwe_dimension_in + 1);
auto block_lwe_array_in = get_chunk(
lwe_array_in, lwe_input_indexes[blockIdx.x], lwe_dimension_in + 1);
auto block_lwe_array_out = get_chunk(
lwe_array_out, lwe_output_indexes[blockIdx.x], lwe_dimension_out + 1);
if (tid == lwe_dimension_out && threadIdx.y == 0) {
local_lwe_out = block_lwe_array_in[lwe_dimension_in];
}
const Torus mask_mod_b = (1ll << base_log) - 1ll;
auto gadget = GadgetMatrixSingle<Torus>(base_log, level_count);
const int pack_size = (lwe_dimension_in + blockDim.y - 1) / blockDim.y;
const int start_i = pack_size * threadIdx.y;
const int end_i = SEL(lwe_dimension_in, pack_size * (threadIdx.y + 1),
pack_size * (threadIdx.y + 1) <= lwe_dimension_in);
int lwe_part_per_thd;
if (tid < cutoff) {
lwe_part_per_thd = lwe_upper;
} else {
lwe_part_per_thd = lwe_lower;
}
__syncthreads();
// This loop distribution seems to benefit the global mem reads
for (int i = start_i; i < end_i; i++) {
Torus a_i = round_to_closest_multiple(block_lwe_array_in[i], base_log,
level_count);
Torus state = a_i >> (sizeof(Torus) * 8 - base_log * level_count);
for (int k = 0; k < lwe_part_per_thd; k++) {
int idx = tid + k * blockDim.x;
local_lwe_array_out[idx] = 0;
}
__syncthreads();
for (int j = 0; j < level_count; j++) {
auto ksk_block =
get_ith_block(ksk, i, j, lwe_dimension_out, level_count);
Torus decomposed = decompose_one<Torus>(state, mask_mod_b, base_log);
local_lwe_out -= (Torus)ksk_block[tid] * decomposed;
}
}
lwe_acc_out[shmem_index] = local_lwe_out;
if (tid == 0) {
local_lwe_array_out[lwe_dimension_out] =
block_lwe_array_in[lwe_dimension_in];
}
if (tid <= lwe_dimension_out) {
for (int offset = blockDim.y / 2; offset > 0 && threadIdx.y < offset;
offset /= 2) {
__syncthreads();
lwe_acc_out[shmem_index] +=
lwe_acc_out[shmem_index + offset * blockDim.x];
for (int i = 0; i < lwe_dimension_in; i++) {
__syncthreads();
Torus a_i =
round_to_closest_multiple(block_lwe_array_in[i], base_log, level_count);
Torus state = a_i >> (sizeof(Torus) * 8 - base_log * level_count);
Torus mask_mod_b = (1ll << base_log) - 1ll;
for (int j = 0; j < level_count; j++) {
auto ksk_block = get_ith_block(ksk, i, j, lwe_dimension_out, level_count);
Torus decomposed = decompose_one<Torus>(state, mask_mod_b, base_log);
for (int k = 0; k < lwe_part_per_thd; k++) {
int idx = tid + k * blockDim.x;
local_lwe_array_out[idx] -= (Torus)ksk_block[idx] * decomposed;
}
}
if (threadIdx.y == 0)
block_lwe_array_out[tid] = lwe_acc_out[shmem_index];
}
for (int k = 0; k < lwe_part_per_thd; k++) {
int idx = tid + k * blockDim.x;
block_lwe_array_out[idx] = local_lwe_array_out[idx];
}
}
/// assume lwe_array_in in the gpu
template <typename Torus>
__host__ void cuda_keyswitch_lwe_ciphertext_vector(
cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus *lwe_output_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes,
Torus *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t gpu_offset = 0) {
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *ksk,
uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, uint32_t base_log,
uint32_t level_count, uint32_t num_samples) {
cudaSetDevice(gpu_index);
cudaSetDevice(stream->gpu_index);
constexpr int ideal_threads = 128;
constexpr int num_threads_y = 32;
int num_blocks, num_threads_x;
int lwe_dim = lwe_dimension_out + 1;
int lwe_lower, lwe_upper, cutoff;
if (lwe_dim % ideal_threads == 0) {
lwe_lower = lwe_dim / ideal_threads;
lwe_upper = lwe_dim / ideal_threads;
cutoff = 0;
} else {
int y =
ceil((double)lwe_dim / (double)ideal_threads) * ideal_threads - lwe_dim;
cutoff = ideal_threads - y;
lwe_lower = lwe_dim / ideal_threads;
lwe_upper = (int)ceil((double)lwe_dim / (double)ideal_threads);
}
getNumBlocksAndThreads2D(lwe_dimension_out + 1, 512, num_threads_y,
num_blocks, num_threads_x);
int lwe_size_after = (lwe_dimension_out + 1) * num_samples;
int shared_mem = sizeof(Torus) * num_threads_y * num_threads_x;
dim3 grid(num_blocks, num_samples, 1);
dim3 threads(num_threads_x, num_threads_y, 1);
int shared_mem = sizeof(Torus) * (lwe_dimension_out + 1);
keyswitch<Torus><<<grid, threads, shared_mem, stream>>>(
cuda_memset_async(lwe_array_out, 0, sizeof(Torus) * lwe_size_after, stream);
check_cuda_error(cudaGetLastError());
dim3 grid(num_samples, 1, 1);
dim3 threads(ideal_threads, 1, 1);
// cudaFuncSetAttribute(keyswitch<Torus>,
// cudaFuncAttributeMaxDynamicSharedMemorySize,
// shared_mem);
keyswitch<<<grid, threads, shared_mem, stream->stream>>>(
lwe_array_out, lwe_output_indexes, lwe_array_in, lwe_input_indexes, ksk,
lwe_dimension_in, lwe_dimension_out, base_log, level_count, gpu_offset);
lwe_dimension_in, lwe_dimension_out, base_log, level_count, lwe_lower,
lwe_upper, cutoff);
check_cuda_error(cudaGetLastError());
}
/*
 * Runs the keyswitch on `num_samples` LWE ciphertexts, split across GPUs.
 *
 * streams / gpu_indexes : one stream and GPU index per GPU
 * gpu_count             : number of entries in the arrays above
 * ksks                  : one keyswitching key pointer per GPU
 * sync_streams          : when true, streams[0] is synchronized before the
 *                         launches and every active stream afterwards
 *
 * Each active GPU i processes get_num_inputs_on_gpu(num_samples, i, gpu_count)
 * samples, starting at offset get_gpu_offset(num_samples, i, gpu_count) in the
 * index arrays. The per-GPU launches are issued from an OpenMP parallel loop
 * with one thread per active GPU.
 */
template <typename Torus>
void execute_keyswitch(cudaStream_t *streams, uint32_t *gpu_indexes,
                       uint32_t gpu_count, Torus *lwe_array_out,
                       Torus *lwe_output_indexes, Torus *lwe_array_in,
                       Torus *lwe_input_indexes, Torus **ksks,
                       uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
                       uint32_t base_log, uint32_t level_count,
                       uint32_t num_samples, bool sync_streams = true) {

  /// If the number of radix blocks is lower than the number of GPUs, not all
  /// GPUs will be active and there will be 1 input per GPU
  auto active_gpu_count = get_active_gpu_count(num_samples, gpu_count);

  if (sync_streams)
    cuda_synchronize_stream(streams[0], gpu_indexes[0]);

#pragma omp parallel for num_threads(active_gpu_count)
  for (uint32_t i = 0; i < active_gpu_count; i++) {
    int num_samples_on_gpu = get_num_inputs_on_gpu(num_samples, i, gpu_count);
    int gpu_offset = get_gpu_offset(num_samples, i, gpu_count);

    // Compute Keyswitch
    cuda_keyswitch_lwe_ciphertext_vector<Torus>(
        streams[i], gpu_indexes[i], lwe_array_out, lwe_output_indexes,
        lwe_array_in, lwe_input_indexes, ksks[i], lwe_dimension_in,
        lwe_dimension_out, base_log, level_count, num_samples_on_gpu,
        gpu_offset);
  }

  if (sync_streams)
    for (uint32_t i = 0; i < active_gpu_count; i++) {
      cuda_synchronize_stream(streams[i], gpu_indexes[i]);
    }
}
#endif

View File

@@ -3,23 +3,14 @@
#include <cuda_runtime.h>
/// Unsafe function to create a CUDA stream, must check first that GPU exists
cudaStream_t cuda_create_stream(uint32_t gpu_index) {
cuda_stream_t *cuda_create_stream(uint32_t gpu_index) {
check_cuda_error(cudaSetDevice(gpu_index));
cudaStream_t stream;
check_cuda_error(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
cuda_stream_t *stream = new cuda_stream_t(gpu_index);
return stream;
}
/// Unsafe function to destroy CUDA stream, must check first the GPU exists
///
/// Makes `gpu_index` the current device and then destroys `stream` with
/// cudaStreamDestroy. Both calls are wrapped in check_cuda_error.
void cuda_destroy_stream(cudaStream_t stream, uint32_t gpu_index) {
check_cuda_error(cudaSetDevice(gpu_index));
check_cuda_error(cudaStreamDestroy(stream));
}
/// Blocks the calling host thread until the work queued on `stream` has
/// completed. `gpu_index` is made the current device before waiting; both
/// runtime calls are wrapped in check_cuda_error.
void cuda_synchronize_stream(cudaStream_t stream, uint32_t gpu_index) {
check_cuda_error(cudaSetDevice(gpu_index));
check_cuda_error(cudaStreamSynchronize(stream));
}
/// Releases the resources held by a cuda_stream_t by forwarding to its
/// release() member.
/// NOTE(review): the stream object is not deleted here; confirm whether
/// release() or the caller is responsible for freeing the object itself.
void cuda_destroy_stream(cuda_stream_t *stream) { stream->release(); }
/// Unsafe function that will try to allocate even if gpu_index is invalid
/// or if there's not enough memory. A safe wrapper around it must call
@@ -34,20 +25,20 @@ void *cuda_malloc(uint64_t size, uint32_t gpu_index) {
/// Allocates a size-byte array at the device memory. Tries to do it
/// asynchronously.
void *cuda_malloc_async(uint64_t size, cudaStream_t stream,
uint32_t gpu_index) {
check_cuda_error(cudaSetDevice(gpu_index));
void *cuda_malloc_async(uint64_t size, cuda_stream_t *stream) {
check_cuda_error(cudaSetDevice(stream->gpu_index));
void *ptr;
#ifndef CUDART_VERSION
#error CUDART_VERSION Undefined!
#elif (CUDART_VERSION >= 11020)
int support_async_alloc;
check_cuda_error(cudaDeviceGetAttribute(
&support_async_alloc, cudaDevAttrMemoryPoolsSupported, gpu_index));
check_cuda_error(cudaDeviceGetAttribute(&support_async_alloc,
cudaDevAttrMemoryPoolsSupported,
stream->gpu_index));
if (support_async_alloc) {
check_cuda_error(cudaMallocAsync((void **)&ptr, size, stream));
check_cuda_error(cudaMallocAsync((void **)&ptr, size, stream->stream));
} else {
check_cuda_error(cudaMalloc((void **)&ptr, size));
}
@@ -65,7 +56,7 @@ void cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index) {
if (size > free_mem) {
PANIC("Cuda error: not enough memory on device. "
"Available: %zu vs Requested: %lu",
free_mem, size)
free_mem, size);
}
}
@@ -80,61 +71,70 @@ bool cuda_check_support_cooperative_groups() {
return cooperative_groups_supported > 0;
}
/// Reports whether thread block cluster launch is available.
///
/// Returns
/// false if Thread Block Cluster is not supported.
/// true otherwise
///
/// When CUDA_ARCH is below 900 at compile time this always returns false
/// without querying the runtime. Otherwise the answer comes from the
/// cudaDevAttrClusterLaunch attribute of device 0 only; other devices are not
/// inspected.
/// NOTE(review): CUDA_ARCH is not the compiler-provided __CUDA_ARCH__; it is
/// presumably supplied by the build system -- confirm it is defined for host
/// compilation, since an undefined macro evaluates to 0 in #if and would
/// silently select the `return false` branch.
bool cuda_check_support_thread_block_clusters() {
#if CUDA_ARCH >= 900
// To-do: Is this really the best way to check support?
int tbc_supported = 0;
// The attribute is written into tbc_supported; the last argument is the
// device ordinal being queried (hard-coded to device 0).
check_cuda_error(
cudaDeviceGetAttribute(&tbc_supported, cudaDevAttrClusterLaunch, 0));
return tbc_supported > 0;
#else
return false;
#endif
}
/// Copy memory to the GPU asynchronously
void cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
cudaStream_t stream, uint32_t gpu_index) {
cuda_stream_t *stream) {
if (size == 0)
return;
cudaPointerAttributes attr;
check_cuda_error(cudaPointerGetAttributes(&attr, dest));
if (attr.device != gpu_index && attr.type != cudaMemoryTypeDevice) {
PANIC("Cuda error: invalid device pointer in async copy to GPU.")
if (attr.device != stream->gpu_index && attr.type != cudaMemoryTypeDevice) {
PANIC("Cuda error: invalid device pointer in async copy to GPU.");
}
check_cuda_error(cudaSetDevice(gpu_index));
check_cuda_error(cudaSetDevice(stream->gpu_index));
check_cuda_error(
cudaMemcpyAsync(dest, src, size, cudaMemcpyHostToDevice, stream));
cudaMemcpyAsync(dest, src, size, cudaMemcpyHostToDevice, stream->stream));
}
/// Synchronous host-to-device copy of `size` bytes from `src` into `dest`.
///
/// A zero-sized request is a no-op. Otherwise `dest` is checked with
/// cudaPointerGetAttributes and the function panics unless it is reported as
/// device memory; the transfer itself is a blocking cudaMemcpy.
void cuda_memcpy_to_gpu(void *dest, void *src, uint64_t size) {
  // Only touch the runtime when there is something to transfer.
  if (size != 0) {
    cudaPointerAttributes dest_attr;
    check_cuda_error(cudaPointerGetAttributes(&dest_attr, dest));

    // The destination has to live in device memory.
    const bool dest_is_device = (dest_attr.type == cudaMemoryTypeDevice);
    if (!dest_is_device) {
      PANIC("Cuda error: invalid device pointer in copy to GPU.");
    }

    check_cuda_error(cudaMemcpy(dest, src, size, cudaMemcpyHostToDevice));
  }
}
/// Synchronous device-to-host copy of `size` bytes from `src` into `dest`.
///
/// A zero-sized request is a no-op. Otherwise `src` is checked with
/// cudaPointerGetAttributes and the function panics unless it is reported as
/// device memory; the transfer itself is a blocking cudaMemcpy.
void cuda_memcpy_to_cpu(void *dest, void *src, uint64_t size) {
  // Only touch the runtime when there is something to transfer.
  if (size != 0) {
    cudaPointerAttributes src_attr;
    check_cuda_error(cudaPointerGetAttributes(&src_attr, src));

    // The source has to live in device memory.
    const bool src_is_device = (src_attr.type == cudaMemoryTypeDevice);
    if (!src_is_device) {
      PANIC("Cuda error: invalid device pointer in copy to CPU.");
    }

    check_cuda_error(cudaMemcpy(dest, src, size, cudaMemcpyDeviceToHost));
  }
}
/// Copy memory within a GPU asynchronously
void cuda_memcpy_async_gpu_to_gpu(void *dest, void *src, uint64_t size,
cudaStream_t stream, uint32_t gpu_index) {
cuda_stream_t *stream) {
if (size == 0)
return;
cudaPointerAttributes attr_dest;
check_cuda_error(cudaPointerGetAttributes(&attr_dest, dest));
if (attr_dest.type != cudaMemoryTypeDevice) {
PANIC("Cuda error: invalid dest device pointer in copy from GPU to GPU.")
if (attr_dest.device != stream->gpu_index &&
attr_dest.type != cudaMemoryTypeDevice) {
PANIC("Cuda error: invalid dest device pointer in copy from GPU to GPU.");
}
cudaPointerAttributes attr_src;
check_cuda_error(cudaPointerGetAttributes(&attr_src, src));
if (attr_src.type != cudaMemoryTypeDevice) {
PANIC("Cuda error: invalid src device pointer in copy from GPU to GPU.")
if (attr_src.device != stream->gpu_index &&
attr_src.type != cudaMemoryTypeDevice) {
PANIC("Cuda error: invalid src device pointer in copy from GPU to GPU.");
}
check_cuda_error(cudaSetDevice(gpu_index));
if (attr_src.device == attr_dest.device) {
check_cuda_error(
cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToDevice, stream));
} else {
check_cuda_error(cudaMemcpyPeerAsync(dest, attr_dest.device, src,
attr_src.device, size, stream));
if (attr_src.device != attr_dest.device) {
PANIC("Cuda error: different devices specified in copy from GPU to GPU.");
}
check_cuda_error(cudaSetDevice(stream->gpu_index));
check_cuda_error(cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToDevice,
stream->stream));
}
/// Synchronizes device
@@ -144,16 +144,16 @@ void cuda_synchronize_device(uint32_t gpu_index) {
}
void cuda_memset_async(void *dest, uint64_t val, uint64_t size,
cudaStream_t stream, uint32_t gpu_index) {
cuda_stream_t *stream) {
if (size == 0)
return;
cudaPointerAttributes attr;
check_cuda_error(cudaPointerGetAttributes(&attr, dest));
if (attr.device != gpu_index && attr.type != cudaMemoryTypeDevice) {
PANIC("Cuda error: invalid dest device pointer in cuda memset.")
if (attr.device != stream->gpu_index && attr.type != cudaMemoryTypeDevice) {
PANIC("Cuda error: invalid dest device pointer in cuda memset.");
}
check_cuda_error(cudaSetDevice(gpu_index));
check_cuda_error(cudaMemsetAsync(dest, val, size, stream));
check_cuda_error(cudaSetDevice(stream->gpu_index));
check_cuda_error(cudaMemsetAsync(dest, val, size, stream->stream));
}
template <typename Torus>
@@ -164,45 +164,42 @@ __global__ void cuda_set_value_kernel(Torus *array, Torus value, Torus n) {
}
template <typename Torus>
void cuda_set_value_async(cudaStream_t stream, uint32_t gpu_index,
Torus *d_array, Torus value, Torus n) {
void cuda_set_value_async(cudaStream_t *stream, Torus *d_array, Torus value,
Torus n) {
cudaPointerAttributes attr;
check_cuda_error(cudaPointerGetAttributes(&attr, d_array));
if (attr.type != cudaMemoryTypeDevice) {
PANIC("Cuda error: invalid dest device pointer in cuda set value.")
PANIC("Cuda error: invalid dest device pointer in cuda set value.");
}
check_cuda_error(cudaSetDevice(gpu_index));
int block_size = 256;
int num_blocks = (n + block_size - 1) / block_size;
// Launch the kernel
cuda_set_value_kernel<<<num_blocks, block_size, 0, stream>>>(d_array, value,
n);
cuda_set_value_kernel<<<num_blocks, block_size, 0, *stream>>>(d_array, value,
n);
check_cuda_error(cudaGetLastError());
}
/// Explicitly instantiate cuda_set_value_async for 32 and 64 bits
template void cuda_set_value_async(cudaStream_t stream, uint32_t gpu_index,
uint64_t *d_array, uint64_t value,
uint64_t n);
template void cuda_set_value_async(cudaStream_t stream, uint32_t gpu_index,
uint32_t *d_array, uint32_t value,
uint32_t n);
template void cuda_set_value_async(cudaStream_t *stream, uint64_t *d_array,
uint64_t value, uint64_t n);
template void cuda_set_value_async(cudaStream_t *stream, uint32_t *d_array,
uint32_t value, uint32_t n);
/// Copy memory to the CPU asynchronously
void cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
cudaStream_t stream, uint32_t gpu_index) {
cuda_stream_t *stream) {
if (size == 0)
return;
cudaPointerAttributes attr;
check_cuda_error(cudaPointerGetAttributes(&attr, src));
if (attr.device != gpu_index && attr.type != cudaMemoryTypeDevice) {
PANIC("Cuda error: invalid src device pointer in copy to CPU async.")
if (attr.device != stream->gpu_index && attr.type != cudaMemoryTypeDevice) {
PANIC("Cuda error: invalid src device pointer in copy to CPU async.");
}
check_cuda_error(cudaSetDevice(gpu_index));
check_cuda_error(cudaSetDevice(stream->gpu_index));
check_cuda_error(
cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToHost, stream));
cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToHost, stream->stream));
}
/// Return number of GPUs available
@@ -219,18 +216,19 @@ void cuda_drop(void *ptr, uint32_t gpu_index) {
}
/// Drop a cuda array asynchronously, if supported on the device
void cuda_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index) {
void cuda_drop_async(void *ptr, cuda_stream_t *stream) {
check_cuda_error(cudaSetDevice(gpu_index));
check_cuda_error(cudaSetDevice(stream->gpu_index));
#ifndef CUDART_VERSION
#error CUDART_VERSION Undefined!
#elif (CUDART_VERSION >= 11020)
int support_async_alloc;
check_cuda_error(cudaDeviceGetAttribute(
&support_async_alloc, cudaDevAttrMemoryPoolsSupported, gpu_index));
check_cuda_error(cudaDeviceGetAttribute(&support_async_alloc,
cudaDevAttrMemoryPoolsSupported,
stream->gpu_index));
if (support_async_alloc) {
check_cuda_error(cudaFreeAsync(ptr, stream));
check_cuda_error(cudaFreeAsync(ptr, stream->stream));
} else {
check_cuda_error(cudaFree(ptr));
}
@@ -242,21 +240,15 @@ void cuda_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index) {
/// Get the maximum size for the shared memory
int cuda_get_max_shared_memory(uint32_t gpu_index) {
check_cuda_error(cudaSetDevice(gpu_index));
cudaDeviceProp prop;
check_cuda_error(cudaGetDeviceProperties(&prop, gpu_index));
int max_shared_memory = 0;
cudaDeviceGetAttribute(&max_shared_memory, cudaDevAttrMaxSharedMemoryPerBlock,
gpu_index);
check_cuda_error(cudaGetLastError());
if (prop.major >= 6) {
max_shared_memory = prop.sharedMemPerMultiprocessor;
} else {
max_shared_memory = prop.sharedMemPerBlock;
}
return max_shared_memory;
}
/// Registers `callback` on `stream` via cudaStreamAddCallback, passing
/// `user_data` through to it. `gpu_index` is made the current device first.
/// Both runtime calls are wrapped in check_cuda_error.
void cuda_stream_add_callback(cudaStream_t stream, uint32_t gpu_index,
cudaStreamCallback_t callback, void *user_data) {
check_cuda_error(cudaSetDevice(gpu_index));
// The trailing 0 is the flags argument of cudaStreamAddCallback.
check_cuda_error(cudaStreamAddCallback(stream, callback, user_data, 0));
}
/// Callback with the cudaStreamCallback_t parameter list that releases a host
/// allocation. `stream` and `status` are ignored; `host_pointer` is handed to
/// free(), so it must have been obtained from malloc/calloc/realloc (or be
/// null).
/// NOTE(review): `status` is not inspected, so the buffer is freed even when
/// the preceding stream work reported an error -- confirm this is intended.
void host_free_on_stream_callback(cudaStream_t stream, cudaError_t status,
void *host_pointer) {
free(host_pointer);
}
/// Synchronizes `stream` by forwarding to its synchronize() member.
void cuda_synchronize_stream(cuda_stream_t *stream) { stream->synchronize(); }

View File

@@ -181,7 +181,7 @@ template <class params> __device__ void NSMFFT_direct(double2 *A) {
// from level 8, we need to check size of params degree, because we support
// minimum actual polynomial size = 256, when compressed size is halfed and
// minimum supported compressed size is 128, so we always need first 7
// levels of butterfly operation, since butterfly levels are hardcoded
// levels of butterfy operation, since butterfly levels are hardcoded
// we need to check if polynomial size is big enough to require specific level
// of butterfly.
if constexpr (params::degree >= 256) {
@@ -353,7 +353,7 @@ template <class params> __device__ void NSMFFT_inverse(double2 *A) {
// compressed size = 8192 is actual polynomial size = 16384.
// twiddles for this size can't fit in constant memory so
// butterfly operation for this level access device memory to fetch
// butterfly operation for this level acess device memory to fetch
// twiddles
if constexpr (params::degree >= 8192) {
// level 13
@@ -484,7 +484,7 @@ template <class params> __device__ void NSMFFT_inverse(double2 *A) {
// below level 8, we don't need to check size of params degree, because we
// support minimum actual polynomial size = 256, when compressed size is
// halfed and minimum supported compressed size is 128, so we always need
// last 7 levels of butterfly operation, since butterfly levels are hardcoded
// last 7 levels of butterfy operation, since butterfly levels are hardcoded
// we don't need to check if polynomial size is big enough to require
// specific level of butterfly.
// level 7

View File

@@ -3,7 +3,7 @@
/*
* 'negtwiddles' are stored in constant memory for faster access times
* because of it's limited size, only twiddles for up to 2^12 polynomial size
* because of it's limitied size, only twiddles for up to 2^12 polynomial size
* can be stored there, twiddles for 2^13 are stored in device memory
* 'negtwiddles13'
*/

View File

@@ -1,13 +1,13 @@
#include "integer/bitwise_ops.cuh"
void scratch_cuda_integer_radix_bitop_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t lwe_ciphertext_count,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
BITOP_TYPE op_type, bool allocate_gpu_memory) {
cuda_stream_t *stream, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t lwe_ciphertext_count, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, BITOP_TYPE op_type,
bool allocate_gpu_memory) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
big_lwe_dimension, small_lwe_dimension, ks_level,
@@ -15,42 +15,37 @@ void scratch_cuda_integer_radix_bitop_kb_64(
message_modulus, carry_modulus);
scratch_cuda_integer_radix_bitop_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_bitop_buffer<uint64_t> **)mem_ptr, lwe_ciphertext_count, params,
op_type, allocate_gpu_memory);
stream, (int_bitop_buffer<uint64_t> **)mem_ptr, lwe_ciphertext_count,
params, op_type, allocate_gpu_memory);
}
void cuda_bitop_integer_radix_ciphertext_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, void *lwe_array_1, void *lwe_array_2, int8_t *mem_ptr,
void **bsks, void **ksks, uint32_t lwe_ciphertext_count) {
cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_1,
void *lwe_array_2, int8_t *mem_ptr, void *bsk, void *ksk,
uint32_t lwe_ciphertext_count) {
host_integer_radix_bitop_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array_out),
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_1),
static_cast<uint64_t *>(lwe_array_2),
(int_bitop_buffer<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
(int_bitop_buffer<uint64_t> *)mem_ptr, bsk, static_cast<uint64_t *>(ksk),
lwe_ciphertext_count);
}
void cuda_bitnot_integer_radix_ciphertext_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, void *lwe_array_in, int8_t *mem_ptr, void **bsks,
void **ksks, uint32_t lwe_ciphertext_count) {
cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_in,
int8_t *mem_ptr, void *bsk, void *ksk, uint32_t lwe_ciphertext_count) {
host_integer_radix_bitnot_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array_out),
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_in),
(int_bitop_buffer<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
(int_bitop_buffer<uint64_t> *)mem_ptr, bsk, static_cast<uint64_t *>(ksk),
lwe_ciphertext_count);
}
void cleanup_cuda_integer_bitop(void **streams, uint32_t *gpu_indexes,
uint32_t gpu_count, int8_t **mem_ptr_void) {
void cleanup_cuda_integer_bitop(cuda_stream_t *stream, int8_t **mem_ptr_void) {
int_bitop_buffer<uint64_t> *mem_ptr =
(int_bitop_buffer<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
mem_ptr->release(stream);
}

View File

@@ -5,49 +5,47 @@
#include "device.h"
#include "integer.cuh"
#include "integer.h"
#include "pbs/programmable_bootstrap_classic.cuh"
#include "pbs/programmable_bootstrap_multibit.cuh"
#include "pbs/bootstrap_low_latency.cuh"
#include "pbs/bootstrap_multibit.cuh"
#include "polynomial/functions.cuh"
#include "utils/kernel_dimensions.cuh"
#include <omp.h>
template <typename Torus>
__host__ void
host_integer_radix_bitop_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array_out,
host_integer_radix_bitop_kb(cuda_stream_t *stream, Torus *lwe_array_out,
Torus *lwe_array_1, Torus *lwe_array_2,
int_bitop_buffer<Torus> *mem_ptr, void **bsks,
Torus **ksks, uint32_t num_radix_blocks) {
int_bitop_buffer<Torus> *mem_ptr, void *bsk,
Torus *ksk, uint32_t num_radix_blocks) {
auto lut = mem_ptr->lut;
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_1, lwe_array_2,
bsks, ksks, num_radix_blocks, lut, lut->params.message_modulus);
}
template <typename Torus>
__host__ void host_integer_radix_bitnot_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array_out, Torus *lwe_array_in, int_bitop_buffer<Torus> *mem_ptr,
void **bsks, Torus **ksks, uint32_t num_radix_blocks) {
auto lut = mem_ptr->lut;
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in, bsks, ksks,
stream, lwe_array_out, lwe_array_1, lwe_array_2, bsk, ksk,
num_radix_blocks, lut);
}
template <typename Torus>
__host__ void scratch_cuda_integer_radix_bitop_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
int_bitop_buffer<Torus> **mem_ptr, uint32_t num_radix_blocks,
int_radix_params params, BITOP_TYPE op, bool allocate_gpu_memory) {
__host__ void
host_integer_radix_bitnot_kb(cuda_stream_t *stream, Torus *lwe_array_out,
Torus *lwe_array_in,
int_bitop_buffer<Torus> *mem_ptr, void *bsk,
Torus *ksk, uint32_t num_radix_blocks) {
*mem_ptr =
new int_bitop_buffer<Torus>(streams, gpu_indexes, gpu_count, op, params,
num_radix_blocks, allocate_gpu_memory);
auto lut = mem_ptr->lut;
integer_radix_apply_univariate_lookup_table_kb<Torus>(
stream, lwe_array_out, lwe_array_in, bsk, ksk, num_radix_blocks, lut);
}
template <typename Torus>
__host__ void scratch_cuda_integer_radix_bitop_kb(
cuda_stream_t *stream, int_bitop_buffer<Torus> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params, BITOP_TYPE op,
bool allocate_gpu_memory) {
*mem_ptr = new int_bitop_buffer<Torus>(stream, op, params, num_radix_blocks,
allocate_gpu_memory);
}
#endif

View File

@@ -1,13 +1,12 @@
#include "integer/cmux.cuh"
void scratch_cuda_integer_radix_cmux_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t lwe_ciphertext_count,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory) {
cuda_stream_t *stream, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t lwe_ciphertext_count, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
big_lwe_dimension, small_lwe_dimension, ks_level,
@@ -18,33 +17,29 @@ void scratch_cuda_integer_radix_cmux_kb_64(
[](uint64_t x) -> uint64_t { return x == 1; };
scratch_cuda_integer_radix_cmux_kb(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_cmux_buffer<uint64_t> **)mem_ptr, predicate_lut_f,
stream, (int_cmux_buffer<uint64_t> **)mem_ptr, predicate_lut_f,
lwe_ciphertext_count, params, allocate_gpu_memory);
}
void cuda_cmux_integer_radix_ciphertext_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, void *lwe_condition, void *lwe_array_true,
void *lwe_array_false, int8_t *mem_ptr, void **bsks, void **ksks,
uint32_t lwe_ciphertext_count) {
cuda_stream_t *stream, void *lwe_array_out, void *lwe_condition,
void *lwe_array_true, void *lwe_array_false, int8_t *mem_ptr, void *bsk,
void *ksk, uint32_t lwe_ciphertext_count) {
host_integer_radix_cmux_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array_out),
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_condition),
static_cast<uint64_t *>(lwe_array_true),
static_cast<uint64_t *>(lwe_array_false),
(int_cmux_buffer<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
(int_cmux_buffer<uint64_t> *)mem_ptr, bsk, static_cast<uint64_t *>(ksk),
lwe_ciphertext_count);
}
void cleanup_cuda_integer_radix_cmux(void **streams, uint32_t *gpu_indexes,
uint32_t gpu_count,
void cleanup_cuda_integer_radix_cmux(cuda_stream_t *stream,
int8_t **mem_ptr_void) {
int_cmux_buffer<uint64_t> *mem_ptr =
(int_cmux_buffer<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
mem_ptr->release(stream);
}

View File

@@ -5,13 +5,11 @@
#include <omp.h>
template <typename Torus>
__host__ void zero_out_if(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array_out,
__host__ void zero_out_if(cuda_stream_t *stream, Torus *lwe_array_out,
Torus *lwe_array_input, Torus *lwe_condition,
int_zero_out_if_buffer<Torus> *mem_ptr,
int_radix_lut<Torus> *predicate, void **bsks,
Torus **ksks, uint32_t num_radix_blocks) {
cudaSetDevice(gpu_indexes[0]);
int_radix_lut<Torus> *predicate, void *bsk,
Torus *ksk, uint32_t num_radix_blocks) {
auto params = mem_ptr->params;
int big_lwe_size = params.big_lwe_dimension + 1;
@@ -28,34 +26,34 @@ __host__ void zero_out_if(cudaStream_t *streams, uint32_t *gpu_indexes,
auto lwe_array_out_block = tmp_lwe_array_input + i * big_lwe_size;
auto lwe_array_input_block = lwe_array_input + i * big_lwe_size;
device_pack_bivariate_blocks<<<num_blocks, num_threads, 0, streams[0]>>>(
lwe_array_out_block, predicate->lwe_indexes_in, lwe_array_input_block,
lwe_condition, predicate->lwe_indexes_in, params.big_lwe_dimension,
device_pack_bivariate_blocks<<<num_blocks, num_threads, 0,
stream->stream>>>(
lwe_array_out_block, lwe_array_input_block, lwe_condition,
predicate->lwe_indexes, params.big_lwe_dimension,
params.message_modulus, 1);
check_cuda_error(cudaGetLastError());
}
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, tmp_lwe_array_input, bsks,
ksks, num_radix_blocks, predicate);
stream, lwe_array_out, tmp_lwe_array_input, bsk, ksk, num_radix_blocks,
predicate);
}
template <typename Torus>
__host__ void host_integer_radix_cmux_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array_out, Torus *lwe_condition, Torus *lwe_array_true,
Torus *lwe_array_false, int_cmux_buffer<Torus> *mem_ptr, void **bsks,
Torus **ksks, uint32_t num_radix_blocks) {
__host__ void
host_integer_radix_cmux_kb(cuda_stream_t *stream, Torus *lwe_array_out,
Torus *lwe_condition, Torus *lwe_array_true,
Torus *lwe_array_false,
int_cmux_buffer<Torus> *mem_ptr, void *bsk,
Torus *ksk, uint32_t num_radix_blocks) {
auto params = mem_ptr->params;
// Since our CPU threads will be working on different streams we shall assert
// the work in the main stream is completed
auto true_streams = mem_ptr->zero_if_true_buffer->true_streams;
auto false_streams = mem_ptr->zero_if_false_buffer->false_streams;
for (uint j = 0; j < gpu_count; j++) {
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
}
stream->synchronize();
auto true_stream = mem_ptr->zero_if_true_buffer->local_stream;
auto false_stream = mem_ptr->zero_if_false_buffer->local_stream;
#pragma omp parallel sections
{
@@ -63,46 +61,40 @@ __host__ void host_integer_radix_cmux_kb(
#pragma omp section
{
auto mem_true = mem_ptr->zero_if_true_buffer;
zero_out_if(true_streams, gpu_indexes, gpu_count, mem_ptr->tmp_true_ct,
lwe_array_true, lwe_condition, mem_true,
mem_ptr->inverted_predicate_lut, bsks, ksks,
num_radix_blocks);
zero_out_if(true_stream, mem_ptr->tmp_true_ct, lwe_array_true,
lwe_condition, mem_true, mem_ptr->inverted_predicate_lut, bsk,
ksk, num_radix_blocks);
}
#pragma omp section
{
auto mem_false = mem_ptr->zero_if_false_buffer;
zero_out_if(false_streams, gpu_indexes, gpu_count, mem_ptr->tmp_false_ct,
lwe_array_false, lwe_condition, mem_false,
mem_ptr->predicate_lut, bsks, ksks, num_radix_blocks);
zero_out_if(false_stream, mem_ptr->tmp_false_ct, lwe_array_false,
lwe_condition, mem_false, mem_ptr->predicate_lut, bsk, ksk,
num_radix_blocks);
}
}
for (uint j = 0; j < gpu_count; j++) {
cuda_synchronize_stream(true_streams[j], gpu_indexes[j]);
cuda_synchronize_stream(false_streams[j], gpu_indexes[j]);
}
cuda_synchronize_stream(true_stream);
cuda_synchronize_stream(false_stream);
// If the condition was true, true_ct will have kept its value and false_ct
// will be 0 If the condition was false, true_ct will be 0 and false_ct will
// have kept its value
auto added_cts = mem_ptr->tmp_true_ct;
host_addition(streams[0], gpu_indexes[0], added_cts, mem_ptr->tmp_true_ct,
mem_ptr->tmp_false_ct, params.big_lwe_dimension,
num_radix_blocks);
host_addition(stream, added_cts, mem_ptr->tmp_true_ct, mem_ptr->tmp_false_ct,
params.big_lwe_dimension, num_radix_blocks);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, added_cts, bsks, ksks,
num_radix_blocks, mem_ptr->message_extract_lut);
stream, lwe_array_out, added_cts, bsk, ksk, num_radix_blocks,
mem_ptr->message_extract_lut);
}
template <typename Torus>
__host__ void scratch_cuda_integer_radix_cmux_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
int_cmux_buffer<Torus> **mem_ptr,
cuda_stream_t *stream, int_cmux_buffer<Torus> **mem_ptr,
std::function<Torus(Torus)> predicate_lut_f, uint32_t num_radix_blocks,
int_radix_params params, bool allocate_gpu_memory) {
*mem_ptr = new int_cmux_buffer<Torus>(streams, gpu_indexes, gpu_count,
predicate_lut_f, params,
*mem_ptr = new int_cmux_buffer<Torus>(stream, predicate_lut_f, params,
num_radix_blocks, allocate_gpu_memory);
}
#endif

View File

@@ -1,13 +1,13 @@
#include "integer/comparison.cuh"
void scratch_cuda_integer_radix_comparison_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_radix_blocks,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
COMPARISON_TYPE op_type, bool is_signed, bool allocate_gpu_memory) {
cuda_stream_t *stream, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t lwe_ciphertext_count, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, COMPARISON_TYPE op_type,
bool allocate_gpu_memory) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
big_lwe_dimension, small_lwe_dimension, ks_level,
@@ -17,10 +17,9 @@ void scratch_cuda_integer_radix_comparison_kb_64(
switch (op_type) {
case EQ:
case NE:
scratch_cuda_integer_radix_comparison_check_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_comparison_buffer<uint64_t> **)mem_ptr, num_radix_blocks, params,
op_type, false, allocate_gpu_memory);
scratch_cuda_integer_radix_equality_check_kb<uint64_t>(
stream, (int_comparison_buffer<uint64_t> **)mem_ptr,
lwe_ciphertext_count, params, op_type, allocate_gpu_memory);
break;
case GT:
case GE:
@@ -28,18 +27,17 @@ void scratch_cuda_integer_radix_comparison_kb_64(
case LE:
case MAX:
case MIN:
scratch_cuda_integer_radix_comparison_check_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_comparison_buffer<uint64_t> **)mem_ptr, num_radix_blocks, params,
op_type, is_signed, allocate_gpu_memory);
scratch_cuda_integer_radix_difference_check_kb<uint64_t>(
stream, (int_comparison_buffer<uint64_t> **)mem_ptr,
lwe_ciphertext_count, params, op_type, allocate_gpu_memory);
break;
}
}
void cuda_comparison_integer_radix_ciphertext_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, void *lwe_array_1, void *lwe_array_2, int8_t *mem_ptr,
void **bsks, void **ksks, uint32_t num_radix_blocks) {
cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_1,
void *lwe_array_2, int8_t *mem_ptr, void *bsk, void *ksk,
uint32_t lwe_ciphertext_count) {
int_comparison_buffer<uint64_t> *buffer =
(int_comparison_buffer<uint64_t> *)mem_ptr;
@@ -47,43 +45,39 @@ void cuda_comparison_integer_radix_ciphertext_kb_64(
case EQ:
case NE:
host_integer_radix_equality_check_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array_out),
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_1),
static_cast<uint64_t *>(lwe_array_2), buffer, bsks, (uint64_t **)(ksks),
num_radix_blocks);
static_cast<uint64_t *>(lwe_array_2), buffer, bsk,
static_cast<uint64_t *>(ksk), lwe_ciphertext_count);
break;
case GT:
case GE:
case LT:
case LE:
host_integer_radix_difference_check_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array_out),
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_1),
static_cast<uint64_t *>(lwe_array_2), buffer,
buffer->diff_buffer->operator_f, bsks, (uint64_t **)(ksks),
num_radix_blocks);
buffer->diff_buffer->operator_f, bsk, static_cast<uint64_t *>(ksk),
lwe_ciphertext_count);
break;
case MAX:
case MIN:
host_integer_radix_maxmin_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array_out),
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_1),
static_cast<uint64_t *>(lwe_array_2), buffer, bsks, (uint64_t **)(ksks),
num_radix_blocks);
static_cast<uint64_t *>(lwe_array_2), buffer, bsk,
static_cast<uint64_t *>(ksk), lwe_ciphertext_count);
break;
default:
PANIC("Cuda error: integer operation not supported")
PANIC("Cuda error: integer operation not supported");
}
}
void cleanup_cuda_integer_comparison(void **streams, uint32_t *gpu_indexes,
uint32_t gpu_count,
void cleanup_cuda_integer_comparison(cuda_stream_t *stream,
int8_t **mem_ptr_void) {
int_comparison_buffer<uint64_t> *mem_ptr =
(int_comparison_buffer<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
mem_ptr->release(stream);
}

View File

@@ -8,8 +8,8 @@
#include "integer/cmux.cuh"
#include "integer/negation.cuh"
#include "integer/scalar_addition.cuh"
#include "pbs/programmable_bootstrap_classic.cuh"
#include "pbs/programmable_bootstrap_multibit.cuh"
#include "pbs/bootstrap_low_latency.cuh"
#include "pbs/bootstrap_multibit.cuh"
#include "types/complex/operations.cuh"
#include "utils/kernel_dimensions.cuh"
@@ -33,17 +33,15 @@ __global__ void device_accumulate_all_blocks(Torus *output, Torus *input_block,
}
template <typename Torus>
__host__ void accumulate_all_blocks(cudaStream_t stream, uint32_t gpu_index,
Torus *output, Torus *input,
uint32_t lwe_dimension,
__host__ void accumulate_all_blocks(cuda_stream_t *stream, Torus *output,
Torus *input, uint32_t lwe_dimension,
uint32_t num_radix_blocks) {
cudaSetDevice(gpu_index);
int num_blocks = 0, num_threads = 0;
int num_entries = (lwe_dimension + 1);
getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
// Add all blocks and store in sum
device_accumulate_all_blocks<<<num_blocks, num_threads, 0, stream>>>(
device_accumulate_all_blocks<<<num_blocks, num_threads, 0, stream->stream>>>(
output, input, lwe_dimension, num_radix_blocks);
check_cuda_error(cudaGetLastError());
}
@@ -56,13 +54,12 @@ __host__ void accumulate_all_blocks(cudaStream_t stream, uint32_t gpu_index,
*
*/
template <typename Torus>
__host__ void are_all_comparisons_block_true(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array_out, Torus *lwe_array_in,
int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
uint32_t num_radix_blocks) {
__host__ void
are_all_comparisons_block_true(cuda_stream_t *stream, Torus *lwe_array_out,
Torus *lwe_array_in,
int_comparison_buffer<Torus> *mem_ptr, void *bsk,
Torus *ksk, uint32_t num_radix_blocks) {
cudaSetDevice(gpu_indexes[0]);
auto params = mem_ptr->params;
auto big_lwe_dimension = params.big_lwe_dimension;
auto glwe_dimension = params.glwe_dimension;
@@ -72,32 +69,28 @@ __host__ void are_all_comparisons_block_true(
auto are_all_block_true_buffer =
mem_ptr->eq_buffer->are_all_block_true_buffer;
auto tmp_out = are_all_block_true_buffer->tmp_out;
uint32_t total_modulus = message_modulus * carry_modulus;
uint32_t max_value = total_modulus - 1;
cuda_memcpy_async_gpu_to_gpu(tmp_out, lwe_array_in,
num_radix_blocks * (big_lwe_dimension + 1) *
sizeof(Torus),
streams[0], gpu_indexes[0]);
cuda_memcpy_async_gpu_to_gpu(
lwe_array_out, lwe_array_in,
num_radix_blocks * (big_lwe_dimension + 1) * sizeof(Torus), stream);
int lut_num_blocks = 0;
uint32_t remaining_blocks = num_radix_blocks;
while (remaining_blocks > 0) {
while (remaining_blocks > 1) {
// Split in max_value chunks
uint32_t chunk_length = std::min(max_value, remaining_blocks);
int num_chunks = remaining_blocks / chunk_length;
// Since all blocks encrypt either 0 or 1, we can sum max_value of them
// as in the worst case we will be adding `max_value` ones
auto input_blocks = tmp_out;
auto input_blocks = lwe_array_out;
auto accumulator = are_all_block_true_buffer->tmp_block_accumulated;
auto is_equal_to_num_blocks_map =
&are_all_block_true_buffer->is_equal_to_lut_map;
for (int i = 0; i < num_chunks; i++) {
accumulate_all_blocks(streams[0], gpu_indexes[0], accumulator,
input_blocks, big_lwe_dimension, chunk_length);
accumulate_all_blocks(stream, accumulator, input_blocks,
big_lwe_dimension, chunk_length);
accumulator += (big_lwe_dimension + 1);
remaining_blocks -= (chunk_length - 1);
@@ -110,45 +103,29 @@ __host__ void are_all_comparisons_block_true(
if (are_all_block_true_buffer->op == COMPARISON_TYPE::NE) {
// is_non_zero_lut_buffer LUT
lut = mem_ptr->eq_buffer->is_non_zero_lut;
} else if (chunk_length == max_value) {
// is_max_value LUT
lut = are_all_block_true_buffer->is_max_value_lut;
} else {
if ((*is_equal_to_num_blocks_map).find(chunk_length) !=
(*is_equal_to_num_blocks_map).end()) {
// The LUT is already computed
lut = (*is_equal_to_num_blocks_map)[chunk_length];
} else {
// LUT needs to be computed
auto new_lut =
new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params,
max_value, num_radix_blocks, true);
// is_equal_to_num_blocks LUT
lut = are_all_block_true_buffer->is_equal_to_num_blocks_lut;
if (chunk_length != lut_num_blocks) {
auto is_equal_to_num_blocks_lut_f = [max_value,
chunk_length](Torus x) -> Torus {
return (x & max_value) == chunk_length;
};
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], new_lut->get_lut(gpu_indexes[0], 0),
glwe_dimension, polynomial_size, message_modulus, carry_modulus,
is_equal_to_num_blocks_lut_f);
stream, lut->lut, glwe_dimension, polynomial_size, message_modulus,
carry_modulus, is_equal_to_num_blocks_lut_f);
new_lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
(*is_equal_to_num_blocks_map)[chunk_length] = new_lut;
lut = new_lut;
// We don't have to generate this lut again
lut_num_blocks = chunk_length;
}
}
// Applies the LUT
if (remaining_blocks == 1) {
// In the last iteration we copy the output to the final address
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, accumulator, bsks,
ksks, 1, lut);
return;
} else {
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, tmp_out, accumulator, bsks, ksks,
num_chunks, lut);
}
integer_radix_apply_univariate_lookup_table_kb<Torus>(
stream, lwe_array_out, accumulator, bsk, ksk, num_chunks, lut);
}
}
@@ -160,12 +137,9 @@ __host__ void are_all_comparisons_block_true(
*/
template <typename Torus>
__host__ void is_at_least_one_comparisons_block_true(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array_out, Torus *lwe_array_in,
int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_in,
int_comparison_buffer<Torus> *mem_ptr, void *bsk, Torus *ksk,
uint32_t num_radix_blocks) {
cudaSetDevice(gpu_indexes[0]);
auto params = mem_ptr->params;
auto big_lwe_dimension = params.big_lwe_dimension;
auto message_modulus = params.message_modulus;
@@ -176,24 +150,23 @@ __host__ void is_at_least_one_comparisons_block_true(
uint32_t total_modulus = message_modulus * carry_modulus;
uint32_t max_value = total_modulus - 1;
cuda_memcpy_async_gpu_to_gpu(mem_ptr->tmp_lwe_array_out, lwe_array_in,
num_radix_blocks * (big_lwe_dimension + 1) *
sizeof(Torus),
streams[0], gpu_indexes[0]);
cuda_memcpy_async_gpu_to_gpu(
lwe_array_out, lwe_array_in,
num_radix_blocks * (big_lwe_dimension + 1) * sizeof(Torus), stream);
uint32_t remaining_blocks = num_radix_blocks;
while (remaining_blocks > 0) {
while (remaining_blocks > 1) {
// Split in max_value chunks
uint32_t chunk_length = std::min(max_value, remaining_blocks);
int num_chunks = remaining_blocks / chunk_length;
// Since all blocks encrypt either 0 or 1, we can sum max_value of them
// as in the worst case we will be adding `max_value` ones
auto input_blocks = mem_ptr->tmp_lwe_array_out;
auto input_blocks = lwe_array_out;
auto accumulator = buffer->tmp_block_accumulated;
for (int i = 0; i < num_chunks; i++) {
accumulate_all_blocks(streams[0], gpu_indexes[0], accumulator,
input_blocks, big_lwe_dimension, chunk_length);
accumulate_all_blocks(stream, accumulator, input_blocks,
big_lwe_dimension, chunk_length);
accumulator += (big_lwe_dimension + 1);
remaining_blocks -= (chunk_length - 1);
@@ -205,17 +178,8 @@ __host__ void is_at_least_one_comparisons_block_true(
int_radix_lut<Torus> *lut = mem_ptr->eq_buffer->is_non_zero_lut;
// Applies the LUT
if (remaining_blocks == 1) {
// In the last iteration we copy the output to the final address
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, accumulator, bsks,
ksks, 1, lut);
return;
} else {
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, mem_ptr->tmp_lwe_array_out,
accumulator, bsks, ksks, num_chunks, lut);
}
integer_radix_apply_univariate_lookup_table_kb<Torus>(
stream, lwe_array_out, accumulator, bsk, ksk, num_chunks, lut);
}
}
@@ -240,12 +204,10 @@ __host__ void is_at_least_one_comparisons_block_true(
// are_all_comparisons_block_true
template <typename Torus>
__host__ void host_compare_with_zero_equality(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array_out, Torus *lwe_array_in,
int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_in,
int_comparison_buffer<Torus> *mem_ptr, void *bsk, Torus *ksk,
int32_t num_radix_blocks, int_radix_lut<Torus> *zero_comparison) {
cudaSetDevice(gpu_indexes[0]);
auto params = mem_ptr->params;
auto big_lwe_dimension = params.big_lwe_dimension;
auto message_modulus = params.message_modulus;
@@ -270,8 +232,7 @@ __host__ void host_compare_with_zero_equality(
if (num_radix_blocks == 1) {
// Just copy
cuda_memcpy_async_gpu_to_gpu(sum, lwe_array_in, big_lwe_size_bytes,
streams[0], gpu_indexes[0]);
cuda_memcpy_async_gpu_to_gpu(sum, lwe_array_in, big_lwe_size_bytes, stream);
num_sum_blocks = 1;
} else {
uint32_t remainder_blocks = num_radix_blocks;
@@ -281,57 +242,76 @@ __host__ void host_compare_with_zero_equality(
uint32_t chunk_size =
std::min(remainder_blocks, num_elements_to_fill_carry);
accumulate_all_blocks(streams[0], gpu_indexes[0], sum_i, chunk,
big_lwe_dimension, chunk_size);
accumulate_all_blocks(stream, sum_i, chunk, big_lwe_dimension,
chunk_size);
num_sum_blocks++;
remainder_blocks -= (chunk_size - 1);
// Update operands
chunk += (chunk_size - 1) * big_lwe_size;
chunk += chunk_size * big_lwe_size;
sum_i += big_lwe_size;
}
}
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, sum, sum, bsks, ksks, num_sum_blocks,
zero_comparison);
are_all_comparisons_block_true(streams, gpu_indexes, gpu_count, lwe_array_out,
sum, mem_ptr, bsks, ksks, num_sum_blocks);
stream, sum, sum, bsk, ksk, num_sum_blocks, zero_comparison);
are_all_comparisons_block_true(stream, lwe_array_out, sum, mem_ptr, bsk, ksk,
num_sum_blocks);
// The result will be in the first block. Everything else is
// garbage.
cuda_memset_async(lwe_array_out + big_lwe_size, 0,
big_lwe_size_bytes * (num_radix_blocks - 1), stream);
}
template <typename Torus>
__host__ void host_integer_radix_equality_check_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array_out, Torus *lwe_array_1, Torus *lwe_array_2,
int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
uint32_t num_radix_blocks) {
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_1,
Torus *lwe_array_2, int_comparison_buffer<Torus> *mem_ptr, void *bsk,
Torus *ksk, uint32_t num_radix_blocks) {
auto eq_buffer = mem_ptr->eq_buffer;
auto params = mem_ptr->params;
auto big_lwe_dimension = params.big_lwe_dimension;
// Applies the LUT for the comparison operation
auto comparisons = mem_ptr->tmp_block_comparisons;
integer_radix_apply_bivariate_lookup_table_kb(
streams, gpu_indexes, gpu_count, comparisons, lwe_array_1, lwe_array_2,
bsks, ksks, num_radix_blocks, eq_buffer->operator_lut,
eq_buffer->operator_lut->params.message_modulus);
stream, comparisons, lwe_array_1, lwe_array_2, bsk, ksk, num_radix_blocks,
eq_buffer->operator_lut);
// This takes a Vec of blocks, where each block is either 0 or 1.
//
// It returns a block encrypting 1 if all input blocks are 1
// It returns a block encrypting 1 if all input blocks are 1
// otherwise the block encrypts 0
are_all_comparisons_block_true(streams, gpu_indexes, gpu_count, lwe_array_out,
comparisons, mem_ptr, bsks, ksks,
num_radix_blocks);
are_all_comparisons_block_true(stream, lwe_array_out, comparisons, mem_ptr,
bsk, ksk, num_radix_blocks);
// Zero all blocks but the first
size_t big_lwe_size = big_lwe_dimension + 1;
size_t big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
cuda_memset_async(lwe_array_out + big_lwe_size, 0,
big_lwe_size_bytes * (num_radix_blocks - 1), stream);
}
// Builds the scratch buffer used by the radix equality check.
//
// A new int_comparison_buffer<Torus> is constructed from `stream`, `op`,
// `params`, `num_radix_blocks` and `allocate_gpu_memory`, and its address is
// written to `*mem_ptr`. Whatever `*mem_ptr` held before is overwritten
// without being released.
template <typename Torus>
__host__ void scratch_cuda_integer_radix_equality_check_kb(
    cuda_stream_t *stream, int_comparison_buffer<Torus> **mem_ptr,
    uint32_t num_radix_blocks, int_radix_params params, COMPARISON_TYPE op,
    bool allocate_gpu_memory) {
  auto *comparison_buffer = new int_comparison_buffer<Torus>(
      stream, op, params, num_radix_blocks, allocate_gpu_memory);
  *mem_ptr = comparison_buffer;
}
template <typename Torus>
__host__ void
compare_radix_blocks_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array_out,
compare_radix_blocks_kb(cuda_stream_t *stream, Torus *lwe_array_out,
Torus *lwe_array_left, Torus *lwe_array_right,
int_comparison_buffer<Torus> *mem_ptr, void **bsks,
Torus **ksks, uint32_t num_radix_blocks) {
int_comparison_buffer<Torus> *mem_ptr, void *bsk,
Torus *ksk, uint32_t num_radix_blocks) {
auto params = mem_ptr->params;
auto big_lwe_dimension = params.big_lwe_dimension;
@@ -353,21 +333,21 @@ compare_radix_blocks_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
// Subtract
// Here we need the true lwe sub, not the one that comes from shortint.
host_subtraction(streams[0], gpu_indexes[0], lwe_array_out, lwe_array_left,
lwe_array_right, big_lwe_dimension, num_radix_blocks);
host_subtraction(stream, lwe_array_out, lwe_array_left, lwe_array_right,
big_lwe_dimension, num_radix_blocks);
// Apply LUT to compare to 0
auto is_non_zero_lut = mem_ptr->eq_buffer->is_non_zero_lut;
integer_radix_apply_univariate_lookup_table_kb(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_out, bsks, ksks,
num_radix_blocks, is_non_zero_lut);
stream, lwe_array_out, lwe_array_out, bsk, ksk, num_radix_blocks,
is_non_zero_lut);
// Add one
// Here Lhs can have the following values: (-1) % (message modulus * carry
// modulus), 0, 1 So the output values after the addition will be: 0, 1, 2
host_integer_radix_add_scalar_one_inplace(
streams, gpu_indexes, gpu_count, lwe_array_out, big_lwe_dimension,
num_radix_blocks, message_modulus, carry_modulus);
host_integer_radix_add_scalar_one_inplace(stream, lwe_array_out,
big_lwe_dimension, num_radix_blocks,
message_modulus, carry_modulus);
}
// Reduces a vec containing shortint blocks that encrypts a sign
@@ -375,12 +355,11 @@ compare_radix_blocks_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
// final sign
template <typename Torus>
__host__ void
tree_sign_reduction(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array_out,
tree_sign_reduction(cuda_stream_t *stream, Torus *lwe_array_out,
Torus *lwe_block_comparisons,
int_tree_sign_reduction_buffer<Torus> *tree_buffer,
std::function<Torus(Torus)> sign_handler_f, void **bsks,
Torus **ksks, uint32_t num_radix_blocks) {
std::function<Torus(Torus)> sign_handler_f, void *bsk,
Torus *ksk, uint32_t num_radix_blocks) {
auto params = tree_buffer->params;
auto big_lwe_dimension = params.big_lwe_dimension;
@@ -400,19 +379,16 @@ tree_sign_reduction(cudaStream_t *streams, uint32_t *gpu_indexes,
auto y = tree_buffer->tmp_y;
if (x != lwe_block_comparisons)
cuda_memcpy_async_gpu_to_gpu(x, lwe_block_comparisons,
big_lwe_size_bytes * num_radix_blocks,
streams[0], gpu_indexes[0]);
big_lwe_size_bytes * num_radix_blocks, stream);
uint32_t partial_block_count = num_radix_blocks;
auto inner_tree_leaf = tree_buffer->tree_inner_leaf_lut;
while (partial_block_count > 2) {
pack_blocks(streams[0], gpu_indexes[0], y, x, big_lwe_dimension,
partial_block_count, 4);
pack_blocks(stream, y, x, big_lwe_dimension, partial_block_count, 4);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, x, y, bsks, ksks,
partial_block_count >> 1, inner_tree_leaf);
stream, x, y, bsk, ksk, partial_block_count >> 1, inner_tree_leaf);
if ((partial_block_count % 2) != 0) {
partial_block_count >>= 1;
@@ -422,8 +398,7 @@ tree_sign_reduction(cudaStream_t *streams, uint32_t *gpu_indexes,
auto last_x_block = x + (partial_block_count - 1) * big_lwe_size;
cuda_memcpy_async_gpu_to_gpu(last_x_block, last_y_block,
big_lwe_size_bytes, streams[0],
gpu_indexes[0]);
big_lwe_size_bytes, stream);
} else {
partial_block_count >>= 1;
}
@@ -434,8 +409,7 @@ tree_sign_reduction(cudaStream_t *streams, uint32_t *gpu_indexes,
std::function<Torus(Torus)> f;
if (partial_block_count == 2) {
pack_blocks(streams[0], gpu_indexes[0], y, x, big_lwe_dimension,
partial_block_count, 4);
pack_blocks(stream, y, x, big_lwe_dimension, partial_block_count, 4);
f = [block_selector_f, sign_handler_f](Torus x) -> Torus {
int msb = (x >> 2) & 3;
@@ -449,61 +423,52 @@ tree_sign_reduction(cudaStream_t *streams, uint32_t *gpu_indexes,
y = x;
f = sign_handler_f;
}
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], last_lut->get_lut(gpu_indexes[0], 0),
glwe_dimension, polynomial_size, message_modulus, carry_modulus, f);
last_lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
generate_device_accumulator<Torus>(stream, last_lut->lut, glwe_dimension,
polynomial_size, message_modulus,
carry_modulus, f);
// Last leaf
integer_radix_apply_univariate_lookup_table_kb(streams, gpu_indexes,
gpu_count, lwe_array_out, y,
bsks, ksks, 1, last_lut);
integer_radix_apply_univariate_lookup_table_kb(stream, lwe_array_out, y, bsk,
ksk, 1, last_lut);
}
template <typename Torus>
__host__ void host_integer_radix_difference_check_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array_out, Torus *lwe_array_left, Torus *lwe_array_right,
int_comparison_buffer<Torus> *mem_ptr,
std::function<Torus(Torus)> reduction_lut_f, void **bsks, Torus **ksks,
uint32_t num_radix_blocks) {
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_left,
Torus *lwe_array_right, int_comparison_buffer<Torus> *mem_ptr,
std::function<Torus(Torus)> reduction_lut_f, void *bsk, Torus *ksk,
uint32_t total_num_radix_blocks) {
auto diff_buffer = mem_ptr->diff_buffer;
auto params = mem_ptr->params;
auto big_lwe_dimension = params.big_lwe_dimension;
auto big_lwe_size = big_lwe_dimension + 1;
auto message_modulus = params.message_modulus;
auto carry_modulus = params.carry_modulus;
uint32_t packed_num_radix_blocks = num_radix_blocks;
uint32_t num_radix_blocks = total_num_radix_blocks;
auto lhs = lwe_array_left;
auto rhs = lwe_array_right;
if (carry_modulus >= message_modulus) {
if (carry_modulus == message_modulus) {
// Packing is possible
// Pack inputs
Torus *packed_left = diff_buffer->tmp_packed_left;
Torus *packed_right = diff_buffer->tmp_packed_right;
// In case the ciphertext is signed, the sign block and the one before it
// are handled separately
if (mem_ptr->is_signed) {
packed_num_radix_blocks -= 2;
}
pack_blocks(streams[0], gpu_indexes[0], packed_left, lwe_array_left,
big_lwe_dimension, packed_num_radix_blocks, message_modulus);
pack_blocks(streams[0], gpu_indexes[0], packed_right, lwe_array_right,
big_lwe_dimension, packed_num_radix_blocks, message_modulus);
pack_blocks(stream, packed_left, lwe_array_left, big_lwe_dimension,
num_radix_blocks, message_modulus);
pack_blocks(stream, packed_right, lwe_array_right, big_lwe_dimension,
num_radix_blocks, message_modulus);
// From this point we have half number of blocks
packed_num_radix_blocks /= 2;
num_radix_blocks /= 2;
// Clean noise
auto identity_lut = mem_ptr->identity_lut;
auto cleaning_lut = mem_ptr->cleaning_lut;
integer_radix_apply_univariate_lookup_table_kb(
streams, gpu_indexes, gpu_count, packed_left, packed_left, bsks, ksks,
packed_num_radix_blocks, identity_lut);
stream, packed_left, packed_left, bsk, ksk, num_radix_blocks,
cleaning_lut);
integer_radix_apply_univariate_lookup_table_kb(
streams, gpu_indexes, gpu_count, packed_right, packed_right, bsks, ksks,
packed_num_radix_blocks, identity_lut);
stream, packed_right, packed_right, bsk, ksk, num_radix_blocks,
cleaning_lut);
lhs = packed_left;
rhs = packed_right;
@@ -514,104 +479,49 @@ __host__ void host_integer_radix_difference_check_kb(
// - 1 if lhs == rhs
// - 2 if lhs > rhs
auto comparisons = mem_ptr->tmp_block_comparisons;
auto num_comparisons = 0;
if (!mem_ptr->is_signed) {
// Compare packed blocks, or simply the total number of radix blocks in the
// inputs
compare_radix_blocks_kb(streams, gpu_indexes, gpu_count, comparisons, lhs,
rhs, mem_ptr, bsks, ksks, packed_num_radix_blocks);
num_comparisons = packed_num_radix_blocks;
} else {
// Packing is possible
if (carry_modulus >= message_modulus) {
// Compare (num_radix_blocks - 2) / 2 packed blocks
compare_radix_blocks_kb(streams, gpu_indexes, gpu_count, comparisons, lhs,
rhs, mem_ptr, bsks, ksks,
packed_num_radix_blocks);
// Compare the last block before the sign block separately
auto identity_lut = mem_ptr->identity_lut;
Torus *last_left_block_before_sign_block =
diff_buffer->tmp_packed_left + packed_num_radix_blocks * big_lwe_size;
Torus *last_right_block_before_sign_block =
diff_buffer->tmp_packed_right +
packed_num_radix_blocks * big_lwe_size;
integer_radix_apply_univariate_lookup_table_kb(
streams, gpu_indexes, gpu_count, last_left_block_before_sign_block,
lwe_array_left + (num_radix_blocks - 2) * big_lwe_size, bsks, ksks, 1,
identity_lut);
integer_radix_apply_univariate_lookup_table_kb(
streams, gpu_indexes, gpu_count, last_right_block_before_sign_block,
lwe_array_right + (num_radix_blocks - 2) * big_lwe_size, bsks, ksks,
1, identity_lut);
compare_radix_blocks_kb(
streams, gpu_indexes, gpu_count,
comparisons + packed_num_radix_blocks * big_lwe_size,
last_left_block_before_sign_block, last_right_block_before_sign_block,
mem_ptr, bsks, ksks, 1);
// Compare the sign block separately
integer_radix_apply_bivariate_lookup_table_kb(
streams, gpu_indexes, gpu_count,
comparisons + (packed_num_radix_blocks + 1) * big_lwe_size,
lwe_array_left + (num_radix_blocks - 1) * big_lwe_size,
lwe_array_right + (num_radix_blocks - 1) * big_lwe_size, bsks, ksks,
1, mem_ptr->signed_lut, mem_ptr->signed_lut->params.message_modulus);
num_comparisons = packed_num_radix_blocks + 2;
} else {
compare_radix_blocks_kb(streams, gpu_indexes, gpu_count, comparisons,
lwe_array_left, lwe_array_right, mem_ptr, bsks,
ksks, num_radix_blocks - 1);
// Compare the sign block separately
integer_radix_apply_bivariate_lookup_table_kb(
streams, gpu_indexes, gpu_count,
comparisons + (num_radix_blocks - 1) * big_lwe_size,
lwe_array_left + (num_radix_blocks - 1) * big_lwe_size,
lwe_array_right + (num_radix_blocks - 1) * big_lwe_size, bsks, ksks,
1, mem_ptr->signed_lut, mem_ptr->signed_lut->params.message_modulus);
num_comparisons = num_radix_blocks;
}
}
compare_radix_blocks_kb(stream, comparisons, lhs, rhs, mem_ptr, bsk, ksk,
num_radix_blocks);
// Reduces a vec containing radix blocks that encrypts a sign
// (inferior, equal, superior) to one single radix block containing the
// final sign
tree_sign_reduction(streams, gpu_indexes, gpu_count, lwe_array_out,
comparisons, mem_ptr->diff_buffer->tree_buffer,
reduction_lut_f, bsks, ksks, num_comparisons);
tree_sign_reduction(stream, lwe_array_out, comparisons,
mem_ptr->diff_buffer->tree_buffer, reduction_lut_f, bsk,
ksk, num_radix_blocks);
// The result will be in the first block. Everything else is garbage.
size_t big_lwe_size = big_lwe_dimension + 1;
size_t big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
cuda_memset_async(lwe_array_out + big_lwe_size, 0,
(total_num_radix_blocks - 1) * big_lwe_size_bytes, stream);
}
template <typename Torus>
__host__ void scratch_cuda_integer_radix_comparison_check_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
int_comparison_buffer<Torus> **mem_ptr, uint32_t num_radix_blocks,
int_radix_params params, COMPARISON_TYPE op, bool is_signed,
__host__ void scratch_cuda_integer_radix_difference_check_kb(
cuda_stream_t *stream, int_comparison_buffer<Torus> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params, COMPARISON_TYPE op,
bool allocate_gpu_memory) {
*mem_ptr = new int_comparison_buffer<Torus>(streams, gpu_indexes, gpu_count,
op, params, num_radix_blocks,
is_signed, allocate_gpu_memory);
*mem_ptr = new int_comparison_buffer<Torus>(
stream, op, params, num_radix_blocks, allocate_gpu_memory);
}
template <typename Torus>
__host__ void
host_integer_radix_maxmin_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array_out,
host_integer_radix_maxmin_kb(cuda_stream_t *stream, Torus *lwe_array_out,
Torus *lwe_array_left, Torus *lwe_array_right,
int_comparison_buffer<Torus> *mem_ptr, void **bsks,
Torus **ksks, uint32_t total_num_radix_blocks) {
int_comparison_buffer<Torus> *mem_ptr, void *bsk,
Torus *ksk, uint32_t total_num_radix_blocks) {
// Compute the sign
host_integer_radix_difference_check_kb(
streams, gpu_indexes, gpu_count, mem_ptr->tmp_lwe_array_out,
lwe_array_left, lwe_array_right, mem_ptr, mem_ptr->identity_lut_f, bsks,
ksks, total_num_radix_blocks);
stream, mem_ptr->tmp_lwe_array_out, lwe_array_left, lwe_array_right,
mem_ptr, mem_ptr->cleaning_lut_f, bsk, ksk, total_num_radix_blocks);
// Selector
host_integer_radix_cmux_kb(streams, gpu_indexes, gpu_count, lwe_array_out,
mem_ptr->tmp_lwe_array_out, lwe_array_left,
lwe_array_right, mem_ptr->cmux_buffer, bsks, ksks,
total_num_radix_blocks);
host_integer_radix_cmux_kb(
stream, lwe_array_out, mem_ptr->tmp_lwe_array_out, lwe_array_left,
lwe_array_right, mem_ptr->cmux_buffer, bsk, ksk, total_num_radix_blocks);
}
#endif

View File

@@ -1,85 +0,0 @@
#include "integer/div_rem.cuh"
// C entry point that builds the scratch memory for the 64-bit radix
// division / remainder operation.
//
// The scalar parameters are bundled into an int_radix_params and forwarded,
// together with the streams and GPU indexes, to
// scratch_cuda_integer_div_rem_kb<uint64_t>. `streams` is reinterpreted as
// cudaStream_t * and `mem_ptr` as int_div_rem_memory<uint64_t> **.
void scratch_cuda_integer_div_rem_radix_ciphertext_kb_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
    uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory) {

  // Typed views of the opaque pointers received through the C interface.
  auto *cuda_streams = reinterpret_cast<cudaStream_t *>(streams);
  auto **div_rem_mem =
      reinterpret_cast<int_div_rem_memory<uint64_t> **>(mem_ptr);

  int_radix_params radix_params(pbs_type, glwe_dimension, polynomial_size,
                                big_lwe_dimension, small_lwe_dimension,
                                ks_level, ks_base_log, pbs_level, pbs_base_log,
                                grouping_factor, message_modulus,
                                carry_modulus);

  scratch_cuda_integer_div_rem_kb<uint64_t>(cuda_streams, gpu_indexes,
                                            gpu_count, div_rem_mem, num_blocks,
                                            radix_params, allocate_gpu_memory);
}
// C entry point for the 64-bit radix division / remainder.
//
// `mem_ptr` is interpreted as an int_div_rem_memory<uint64_t> *. The call is
// dispatched on its `params.polynomial_size` to the matching Degree<N>
// instantiation of host_integer_div_rem_kb. The ciphertext pointers
// (`quotient`, `remainder`, `numerator`, `divisor`) are passed on as
// uint64_t *, `ksks` as uint64_t ** and `streams` as cudaStream_t *.
// Any polynomial size other than 512, 1024, 2048, 4096, 8192 or 16384 panics.
void cuda_integer_div_rem_radix_ciphertext_kb_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *quotient,
    void *remainder, void *numerator, void *divisor, int8_t *mem_ptr,
    void **bsks, void **ksks, uint32_t num_blocks) {

  // Convert the opaque C-interface pointers once, so every case below is a
  // plain call that differs only in the Degree<N> template argument.
  auto *div_rem_mem = reinterpret_cast<int_div_rem_memory<uint64_t> *>(mem_ptr);
  auto *cuda_streams = reinterpret_cast<cudaStream_t *>(streams);
  auto *quotient_u64 = static_cast<uint64_t *>(quotient);
  auto *remainder_u64 = static_cast<uint64_t *>(remainder);
  auto *numerator_u64 = static_cast<uint64_t *>(numerator);
  auto *divisor_u64 = static_cast<uint64_t *>(divisor);
  auto **ksks_u64 = reinterpret_cast<uint64_t **>(ksks);

  switch (div_rem_mem->params.polynomial_size) {
  case 512:
    host_integer_div_rem_kb<uint64_t, Degree<512>>(
        cuda_streams, gpu_indexes, gpu_count, quotient_u64, remainder_u64,
        numerator_u64, divisor_u64, bsks, ksks_u64, div_rem_mem, num_blocks);
    break;
  case 1024:
    host_integer_div_rem_kb<uint64_t, Degree<1024>>(
        cuda_streams, gpu_indexes, gpu_count, quotient_u64, remainder_u64,
        numerator_u64, divisor_u64, bsks, ksks_u64, div_rem_mem, num_blocks);
    break;
  case 2048:
    host_integer_div_rem_kb<uint64_t, Degree<2048>>(
        cuda_streams, gpu_indexes, gpu_count, quotient_u64, remainder_u64,
        numerator_u64, divisor_u64, bsks, ksks_u64, div_rem_mem, num_blocks);
    break;
  case 4096:
    host_integer_div_rem_kb<uint64_t, Degree<4096>>(
        cuda_streams, gpu_indexes, gpu_count, quotient_u64, remainder_u64,
        numerator_u64, divisor_u64, bsks, ksks_u64, div_rem_mem, num_blocks);
    break;
  case 8192:
    host_integer_div_rem_kb<uint64_t, Degree<8192>>(
        cuda_streams, gpu_indexes, gpu_count, quotient_u64, remainder_u64,
        numerator_u64, divisor_u64, bsks, ksks_u64, div_rem_mem, num_blocks);
    break;
  case 16384:
    host_integer_div_rem_kb<uint64_t, Degree<16384>>(
        cuda_streams, gpu_indexes, gpu_count, quotient_u64, remainder_u64,
        numerator_u64, divisor_u64, bsks, ksks_u64, div_rem_mem, num_blocks);
    break;
  default:
    PANIC("Cuda error (integer div_rem): unsupported polynomial size. "
          "Only N = 512, 1024, 2048, 4096, 8192, 16384 is supported")
  }
}
// C entry point that releases the resources held by a div_rem scratch buffer.
//
// `*mem_ptr_void` is interpreted as an int_div_rem_memory<uint64_t> * and its
// release() method is called with the given streams / GPU indexes. This
// function itself neither deletes the object nor modifies `*mem_ptr_void`.
void cleanup_cuda_integer_div_rem(void **streams, uint32_t *gpu_indexes,
                                  uint32_t gpu_count, int8_t **mem_ptr_void) {
  auto *div_rem_mem =
      reinterpret_cast<int_div_rem_memory<uint64_t> *>(*mem_ptr_void);
  auto *cuda_streams = reinterpret_cast<cudaStream_t *>(streams);

  div_rem_mem->release(cuda_streams, gpu_indexes, gpu_count);
}

View File

@@ -1,641 +0,0 @@
#ifndef TFHE_RS_DIV_REM_CUH
#define TFHE_RS_DIV_REM_CUH
#include "crypto/keyswitch.cuh"
#include "device.h"
#include "integer.h"
#include "integer/comparison.cuh"
#include "integer/integer.cuh"
#include "integer/negation.cuh"
#include "integer/scalar_shifts.cuh"
#include "linear_algebra.h"
#include "programmable_bootstrap.h"
#include "utils/helper.cuh"
#include "utils/kernel_dimensions.cuh"
#include <fstream>
#include <iostream>
#include <omp.h>
#include <sstream>
#include <string>
#include <vector>
// Returns a / b rounded up, computed as (a + b - 1) / b.
// The result is the ceiling for a >= 0 and b > 0; `a + b - 1` must fit in an
// int.
// Marked inline because the function is fully defined in this header: without
// it, including the header from more than one translation unit results in
// multiple-definition errors at link time.
inline int ceil_div(int a, int b) { return (a + b - 1) / b; }
// Helper struct that makes it easier to work with a list of ciphertexts and
// to move data between such lists.
// The struct does not allocate or free any memory; it only keeps track of
// the number of ciphertexts in the list.
template <typename Torus> struct lwe_ciphertext_list {
Torus *data;
size_t max_blocks;
size_t len;
int_radix_params params;
size_t big_lwe_size;
size_t radix_size;
size_t big_lwe_size_bytes;
size_t radix_size_bytes;
size_t big_lwe_dimension;
// Wraps the existing buffer `src`; nothing is allocated or copied here.
// `max_blocks` is stored as the capacity and is also the initial `len`.
// The size fields are derived from params.big_lwe_dimension:
//   big_lwe_size = big_lwe_dimension + 1
//   radix_size   = max_blocks * big_lwe_size
// and the *_bytes fields are those values multiplied by sizeof(Torus).
lwe_ciphertext_list(Torus *src, int_radix_params params, size_t max_blocks)
    // Listed in member declaration order (data, max_blocks, len, params),
    // which is the order in which members are actually initialized.
    : data(src), max_blocks(max_blocks), len(max_blocks), params(params) {
  big_lwe_size = params.big_lwe_dimension + 1;
  big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
  radix_size = max_blocks * big_lwe_size;
  radix_size_bytes = radix_size * sizeof(Torus);
  big_lwe_dimension = params.big_lwe_dimension;
}
// Copies blocks `start_block` through `finish_block` (both included) of `src`
// to the beginning of `data`, using cuda_memcpy_async_gpu_to_gpu on
// `stream` / `gpu_index`. The value of `len` is not changed.
void copy_from(Torus *src, size_t start_block, size_t finish_block,
               cudaStream_t stream, uint32_t gpu_index) {
  // The block count is computed with unsigned arithmetic: an inverted range
  // would wrap around to a huge copy size.
  assert(start_block <= finish_block);
  size_t tmp_len = finish_block - start_block + 1;
  // `data` holds at most `max_blocks` blocks.
  assert(tmp_len <= max_blocks);
  cuda_memcpy_async_gpu_to_gpu(data, &src[start_block * big_lwe_size],
                               tmp_len * big_lwe_size_bytes, stream,
                               gpu_index);
}
// Same as the Torus* overload of copy_from, taking the source blocks from the
// `data` buffer of another lwe_ciphertext_list. Blocks `start_block` through
// `finish_block` (both included) are copied; `len` is not changed.
void copy_from(const lwe_ciphertext_list &src, size_t start_block,
               size_t finish_block, cudaStream_t stream, uint32_t gpu_index) {
  Torus *src_blocks = src.data;
  copy_from(src_blocks, start_block, finish_block, stream, gpu_index);
}
// Copies blocks `start_block` through `finish_block` (both included) of `src`
// to the beginning of `data`, using cuda_memcpy_async_gpu_to_gpu on
// `stream` / `gpu_index`, and sets `len` to the number of copied blocks.
void clone_from(Torus *src, size_t start_block, size_t finish_block,
                cudaStream_t stream, uint32_t gpu_index) {
  // The block count is computed with unsigned arithmetic: an inverted range
  // would wrap around to a huge length.
  assert(start_block <= finish_block);
  size_t new_len = finish_block - start_block + 1;
  // `data` holds at most `max_blocks` blocks, so `len` must not exceed it.
  assert(new_len <= max_blocks);
  len = new_len;
  cuda_memcpy_async_gpu_to_gpu(data, &src[start_block * big_lwe_size],
                               len * big_lwe_size_bytes, stream, gpu_index);
}
// Same as the Torus* overload of clone_from, taking the source blocks from
// the `data` buffer of another lwe_ciphertext_list. Blocks `start_block`
// through `finish_block` (both included) are copied and `len` is updated.
void clone_from(const lwe_ciphertext_list &src, size_t start_block,
                size_t finish_block, cudaStream_t stream,
                uint32_t gpu_index) {
  Torus *src_blocks = src.data;
  clone_from(src_blocks, start_block, finish_block, stream, gpu_index);
}
// assign zero to blocks starting from `start_block` including `finish_block`
void assign_zero(size_t start_block, size_t finish_block, cudaStream_t stream,
                 uint32_t gpu_index) {
  // Both ends of the block range are included in the cleared area.
  const auto block_count = finish_block - start_block + 1;
  Torus *first_block_to_clear = data + start_block * big_lwe_size;
  cuda_memset_async(first_block_to_clear, 0, block_count * big_lwe_size_bytes,
                    stream, gpu_index);
}
// return pointer to last block
Torus *last_block() {
  // An empty list has no last block: evaluating `len - 1` with `len == 0`
  // would index outside the buffer, so trap it like get_block does.
  assert(len > 0);
  return &data[(len - 1) * big_lwe_size];
}
// return pointer to first_block
Torus *first_block() {
  // Block 0 sits at the very beginning of the wrapped buffer.
  return data;
}
// return block with `index`
Torus *get_block(size_t index) {
  // Only blocks below the current length are addressable.
  assert(index < len);
  const size_t block_offset = index * big_lwe_size;
  return data + block_offset;
}
bool is_empty() {
  // True when the list currently holds no blocks.
  return 0 == len;
}
// does not drop actual memory from `data`, only reduces value of `len` by one
void pop() {
  // Popping an empty list is a programming error: it is trapped when
  // assertions are enabled, and `len` is left untouched otherwise.
  assert(len > 0);
  if (len > 0)
    len--;
}
// insert ciphertext at index `ind`
void insert(size_t ind, Torus *ciphertext_block, cudaStream_t stream,
            uint32_t gpu_index) {
  assert(ind <= len);
  assert(len < max_blocks);

  // Open a gap at `ind`: move every block from `ind` onwards one slot towards
  // the end, starting with the last one so that no block is overwritten
  // before it has been moved. All copies are queued on the same stream.
  size_t slot = len;
  while (slot > ind) {
    Torus *from = data + (slot - 1) * big_lwe_size;
    Torus *to = data + slot * big_lwe_size;
    cuda_memcpy_async_gpu_to_gpu(to, from, big_lwe_size_bytes, stream,
                                 gpu_index);
    --slot;
  }

  // Drop the new block into the freed slot.
  Torus *gap = data + ind * big_lwe_size;
  cuda_memcpy_async_gpu_to_gpu(gap, ciphertext_block, big_lwe_size_bytes,
                               stream, gpu_index);
  ++len;
}
// push ciphertext at the end of `data`
void push(Torus *ciphertext_block, cudaStream_t stream, uint32_t gpu_index) {
  assert(len < max_blocks);
  // The new block lands right after the current last block.
  Torus *tail_slot = data + len * big_lwe_size;
  cuda_memcpy_async_gpu_to_gpu(tail_slot, ciphertext_block,
                               big_lwe_size_bytes, stream, gpu_index);
  ++len;
}
// duplicate ciphertext into `number_of_blocks` ciphertexts
void fill_with_same_ciphertext(Torus *ciphertext, size_t number_of_blocks,
                               cudaStream_t stream, uint32_t gpu_index) {
  assert(number_of_blocks <= max_blocks);
  // Walk the destination one block at a time, writing the same source block
  // into each slot.
  Torus *slot = data;
  for (size_t remaining = number_of_blocks; remaining > 0; --remaining) {
    cuda_memcpy_async_gpu_to_gpu(slot, ciphertext, big_lwe_size_bytes, stream,
                                 gpu_index);
    slot += big_lwe_size;
  }
  // The list now holds exactly the duplicated blocks.
  len = number_of_blocks;
}
// used for debugging, prints body of each ciphertext.
void print_blocks_body(const char *name) {
  // Debug helper: for every block currently in the list, prints the single
  // element stored at offset `big_lwe_dimension` inside the block (its last
  // element, i.e. the body), labelled with `name`.
  // The index is a size_t, like the other block loops of this struct, so the
  // comparison with the block count does not mix signed and unsigned values.
  for (size_t i = 0; i < len; i++) {
    print_debug(name, &data[i * big_lwe_size + big_lwe_dimension], 1);
  }
}
};
// Creates the scratch memory used by the integer division/remainder
// operation: a new `int_div_rem_memory<Torus>` is built from the given
// streams, GPU indexes, radix parameters and block count, and its address is
// stored in `*mem_ptr`. Nothing in this function releases that object.
template <typename Torus>
__host__ void scratch_cuda_integer_div_rem_kb(
    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    int_div_rem_memory<Torus> **mem_ptr, uint32_t num_blocks,
    int_radix_params params, bool allocate_gpu_memory) {
  auto *div_rem_memory = new int_div_rem_memory<Torus>(
      streams, gpu_indexes, gpu_count, params, num_blocks, allocate_gpu_memory);
  *mem_ptr = div_rem_memory;
}
// Encrypted integer division: fills `quotient` and `remainder` with the
// result of dividing `numerator` by `divisor`, all given as radix ciphertexts
// of `num_blocks` blocks.
//
// The bits of the numerator are consumed one per iteration, from the most
// significant to the least significant, following the
// "R := R << 1; R(0) := N(i)" long-division recurrence. Every iteration is
// split in three phases whose independent steps are issued from OpenMP
// sections onto the `mem_ptr->sub_streams_*` streams: `streams` is
// synchronized before each phase and the sub-streams right after it.
//
// All temporary buffers come from `mem_ptr`; they are only wrapped in
// `lwe_ciphertext_list` views here, this function allocates nothing.
//
// Parameters:
//   streams, gpu_indexes, gpu_count - streams/GPUs used for the main flow
//   quotient, remainder             - outputs, `num_blocks` blocks each
//   numerator, divisor              - inputs, `num_blocks` blocks each
//   bsks, ksks                      - bootstrapping / keyswitching keys,
//                                     forwarded as-is to the helpers
//   mem_ptr                         - scratch memory and lookup tables
//   num_blocks                      - number of radix blocks per operand
//
// NOTE(review): `mem_ptr` is typed `int_div_rem_memory<uint64_t>` whatever
// `Torus` is - presumably only Torus == uint64_t is instantiated; confirm.
template <typename Torus, class params>
__host__ void
host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
                        uint32_t gpu_count, Torus *quotient, Torus *remainder,
                        Torus *numerator, Torus *divisor, void **bsks,
                        uint64_t **ksks, int_div_rem_memory<uint64_t> *mem_ptr,
                        uint32_t num_blocks) {
  auto radix_params = mem_ptr->params;
  auto big_lwe_dimension = radix_params.big_lwe_dimension;
  auto big_lwe_size = big_lwe_dimension + 1;
  auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus);

  uint32_t message_modulus = radix_params.message_modulus;
  uint32_t carry_modulus = radix_params.carry_modulus;
  // Index of the highest set bit of message_modulus, i.e. log2 when
  // message_modulus is a power of two
  uint32_t num_bits_in_message = 31 - __builtin_clz(message_modulus);
  uint32_t total_bits = num_bits_in_message * num_blocks;

  // put temporary buffers in lwe_ciphertext_list for easy use
  lwe_ciphertext_list<Torus> remainder1(mem_ptr->remainder1, radix_params,
                                        num_blocks);
  lwe_ciphertext_list<Torus> remainder2(mem_ptr->remainder2, radix_params,
                                        num_blocks);
  lwe_ciphertext_list<Torus> numerator_block_stack(
      mem_ptr->numerator_block_stack, radix_params, num_blocks);
  lwe_ciphertext_list<Torus> numerator_block_1(mem_ptr->numerator_block_1,
                                               radix_params, 1);
  lwe_ciphertext_list<Torus> tmp_radix(mem_ptr->tmp_radix, radix_params,
                                       num_blocks + 1);
  lwe_ciphertext_list<Torus> interesting_remainder1(
      mem_ptr->interesting_remainder1, radix_params, num_blocks + 1);
  lwe_ciphertext_list<Torus> interesting_remainder2(
      mem_ptr->interesting_remainder2, radix_params, num_blocks);
  lwe_ciphertext_list<Torus> interesting_divisor(mem_ptr->interesting_divisor,
                                                 radix_params, num_blocks);
  lwe_ciphertext_list<Torus> divisor_ms_blocks(mem_ptr->divisor_ms_blocks,
                                               radix_params, num_blocks);
  lwe_ciphertext_list<Torus> new_remainder(mem_ptr->new_remainder, radix_params,
                                           num_blocks);
  lwe_ciphertext_list<Torus> subtraction_overflowed(
      mem_ptr->subtraction_overflowed, radix_params, 1);
  lwe_ciphertext_list<Torus> did_not_overflow(mem_ptr->did_not_overflow,
                                              radix_params, 1);
  lwe_ciphertext_list<Torus> overflow_sum(mem_ptr->overflow_sum, radix_params,
                                          1);
  lwe_ciphertext_list<Torus> overflow_sum_radix(mem_ptr->overflow_sum_radix,
                                                radix_params, num_blocks);
  lwe_ciphertext_list<Torus> tmp_1(mem_ptr->tmp_1, radix_params, num_blocks);
  lwe_ciphertext_list<Torus> at_least_one_upper_block_is_non_zero(
      mem_ptr->at_least_one_upper_block_is_non_zero, radix_params, 1);
  lwe_ciphertext_list<Torus> cleaned_merged_interesting_remainder(
      mem_ptr->cleaned_merged_interesting_remainder, radix_params, num_blocks);

  // Initial state: the stack holds the whole numerator, both remainders and
  // the quotient are zero.
  numerator_block_stack.clone_from(numerator, 0, num_blocks - 1, streams[0],
                                   gpu_indexes[0]);
  remainder1.assign_zero(0, num_blocks - 1, streams[0], gpu_indexes[0]);
  remainder2.assign_zero(0, num_blocks - 1, streams[0], gpu_indexes[0]);
  cuda_memset_async(quotient, 0, big_lwe_size_bytes * num_blocks, streams[0],
                    gpu_indexes[0]);

  for (int i = total_bits - 1; i >= 0; i--) {
    uint32_t block_of_bit = i / num_bits_in_message;
    uint32_t pos_in_block = i % num_bits_in_message;
    uint32_t msb_bit_set = total_bits - 1 - i;
    uint32_t last_non_trivial_block = msb_bit_set / num_bits_in_message;

    // Index to the first block of the remainder that is fully trivial 0
    // and all blocks after it are also trivial zeros
    // This number is in range 1..=num_blocks
    uint32_t first_trivial_block = last_non_trivial_block + 1;

    interesting_remainder1.clone_from(remainder1, 0, last_non_trivial_block,
                                      streams[0], gpu_indexes[0]);
    interesting_remainder2.clone_from(remainder2, 0, last_non_trivial_block,
                                      streams[0], gpu_indexes[0]);
    interesting_divisor.clone_from(divisor, 0, last_non_trivial_block,
                                   streams[0], gpu_indexes[0]);
    // When the start index is `num_blocks` (last iteration) the requested
    // range is empty and divisor_ms_blocks ends up with len == 0.
    divisor_ms_blocks.clone_from(divisor,
                                 (msb_bit_set + 1) / num_bits_in_message,
                                 num_blocks - 1, streams[0], gpu_indexes[0]);

    // We split the divisor at a block position, when in reality the split
    // should be at a bit position meaning that potentially (depending on
    // msb_bit_set) the split versions share some bits they should not. So we do
    // one PBS on the last block of the interesting_divisor, and first block of
    // divisor_ms_blocks to trim out bits which should not be there
    auto trim_last_interesting_divisor_bits =
        [&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) {
          if ((msb_bit_set + 1) % num_bits_in_message == 0) {
            return;
          }
          // The last block of the interesting part of the divisor
          // can contain bits which we should not account for
          // we have to zero them out.

          // Where the msb is set in the block
          uint32_t pos_in_block = msb_bit_set % num_bits_in_message;

          // e.g 2 bits in message:
          // if pos_in_block is 0, then we want to keep only first bit
          // (right shift mask by 1)
          // if pos_in_block is 1, then we want to keep the two bits
          // (right shift mask by 0)
          uint32_t shift_amount = num_bits_in_message - (pos_in_block + 1);

          // Create mask of 1s on the message part, 0s in the carries
          uint32_t full_message_mask = message_modulus - 1;

          // Shift the mask so that we will only keep bits we should
          uint32_t shifted_mask = full_message_mask >> shift_amount;

          integer_radix_apply_univariate_lookup_table_kb(
              streams, gpu_indexes, gpu_count, interesting_divisor.last_block(),
              interesting_divisor.last_block(), bsks, ksks, 1,
              mem_ptr->masking_luts_1[shifted_mask]);
        }; // trim_last_interesting_divisor_bits

    auto trim_first_divisor_ms_bits =
        [&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) {
          if (divisor_ms_blocks.is_empty() ||
              ((msb_bit_set + 1) % num_bits_in_message) == 0) {
            return;
          }
          // Where the msb is set in the block
          uint32_t pos_in_block = msb_bit_set % num_bits_in_message;

          // e.g 2 bits in message:
          // if pos_in_block is 0, then we want to discard the first bit
          // (left shift mask by 1)
          // if pos_in_block is 1, then we want to discard the two bits
          // (left shift mask by 2)
          uint32_t shift_amount = pos_in_block + 1;
          uint32_t full_message_mask = message_modulus - 1;
          uint32_t shifted_mask = full_message_mask << shift_amount;

          // Keep the mask within the range of message bits, so that
          // the estimated degree of the output is < msg_modulus
          shifted_mask = shifted_mask & full_message_mask;

          integer_radix_apply_univariate_lookup_table_kb(
              streams, gpu_indexes, gpu_count, divisor_ms_blocks.first_block(),
              divisor_ms_blocks.first_block(), bsks, ksks, 1,
              mem_ptr->masking_luts_2[shifted_mask]);
        }; // trim_first_divisor_ms_bits

    // This does
    //  R := R << 1; R(0) := N(i)
    //
    // We could do that by left shifting R by one, then unchecked_add the
    // correct numerator bit.
    //
    // However, to keep the remainder clean (noise wise), what we do is that we
    // put the remainder block from which we need to extract the bit, as the LSB
    // of the Remainder, so that left shifting will pull the bit we need.
    auto left_shift_interesting_remainder1 =
        [&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) {
          numerator_block_1.clone_from(
              numerator_block_stack, numerator_block_stack.len - 1,
              numerator_block_stack.len - 1, streams[0], gpu_indexes[0]);
          numerator_block_stack.pop();
          // The list temporarily grows by one block here ...
          interesting_remainder1.insert(0, numerator_block_1.first_block(),
                                        streams[0], gpu_indexes[0]);

          host_integer_radix_logical_scalar_shift_kb_inplace(
              streams, gpu_indexes, gpu_count, interesting_remainder1.data, 1,
              mem_ptr->shift_mem_1, bsks, ksks, interesting_remainder1.len);

          tmp_radix.clone_from(interesting_remainder1, 0,
                               interesting_remainder1.len - 1, streams[0],
                               gpu_indexes[0]);

          host_radix_blocks_rotate_left(
              streams, gpu_indexes, gpu_count, interesting_remainder1.data,
              tmp_radix.data, 1, interesting_remainder1.len, big_lwe_size);

          numerator_block_1.clone_from(
              interesting_remainder1, interesting_remainder1.len - 1,
              interesting_remainder1.len - 1, streams[0], gpu_indexes[0]);

          // ... and shrinks back to its original length here.
          interesting_remainder1.pop();

          if (pos_in_block != 0) {
            // We have not yet extracted all the bits from this numerator
            // so, we put it back on the front so that it gets taken next
            // iteration
            numerator_block_stack.push(numerator_block_1.first_block(),
                                       streams[0], gpu_indexes[0]);
          }
        }; // left_shift_interesting_remainder1

    auto left_shift_interesting_remainder2 =
        [&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) {
          host_integer_radix_logical_scalar_shift_kb_inplace(
              streams, gpu_indexes, gpu_count, interesting_remainder2.data, 1,
              mem_ptr->shift_mem_2, bsks, ksks, interesting_remainder2.len);
        }; // left_shift_interesting_remainder2

    // phase 1
    for (uint j = 0; j < gpu_count; j++) {
      cuda_synchronize_stream(streams[j], gpu_indexes[j]);
    }
#pragma omp parallel sections
    {
#pragma omp section
      {
        // interesting_divisor
        trim_last_interesting_divisor_bits(mem_ptr->sub_streams_1, gpu_indexes,
                                           gpu_count);
      }
#pragma omp section
      {
        // divisor_ms_blocks
        trim_first_divisor_ms_bits(mem_ptr->sub_streams_2, gpu_indexes,
                                   gpu_count);
      }
#pragma omp section
      {
        // interesting_remainder1
        // numerator_block_stack
        left_shift_interesting_remainder1(mem_ptr->sub_streams_3, gpu_indexes,
                                          gpu_count);
      }
#pragma omp section
      {
        // interesting_remainder2
        left_shift_interesting_remainder2(mem_ptr->sub_streams_4, gpu_indexes,
                                          gpu_count);
      }
    }
    for (uint j = 0; j < gpu_count; j++) {
      cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
      cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
      cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]);
      cuda_synchronize_stream(mem_ptr->sub_streams_4[j], gpu_indexes[j]);
    }

    // if interesting_remainder1 != 0 -> interesting_remainder2 == 0
    // if interesting_remainder1 == 0 -> interesting_remainder2 != 0
    // In practice interesting_remainder1 contains the numerator bit,
    // but in that position, interesting_remainder2 always has a 0
    auto &merged_interesting_remainder = interesting_remainder1;

    host_addition(streams[0], gpu_indexes[0], merged_interesting_remainder.data,
                  merged_interesting_remainder.data,
                  interesting_remainder2.data, radix_params.big_lwe_dimension,
                  merged_interesting_remainder.len);

    // after create_clean_version_of_merged_remainder
    // `merged_interesting_remainder` will be reused as
    // `cleaned_merged_interesting_remainder`
    cleaned_merged_interesting_remainder.clone_from(
        merged_interesting_remainder, 0, merged_interesting_remainder.len - 1,
        streams[0], gpu_indexes[0]);

    assert(merged_interesting_remainder.len == interesting_divisor.len);

    // `new_remainder` is not initialized yet, so need to set length
    new_remainder.len = merged_interesting_remainder.len;

    // fills:
    //  `new_remainder` - radix ciphertext
    //  `subtraction_overflowed` - single ciphertext
    auto do_overflowing_sub = [&](cudaStream_t *streams, uint32_t *gpu_indexes,
                                  uint32_t gpu_count) {
      host_integer_overflowing_sub_kb<Torus, params>(
          streams, gpu_indexes, gpu_count, new_remainder.data,
          subtraction_overflowed.data, merged_interesting_remainder.data,
          interesting_divisor.data, bsks, ksks, mem_ptr->overflow_sub_mem,
          merged_interesting_remainder.len);
    };

    // fills:
    //  `at_least_one_upper_block_is_non_zero` - single ciphertext
    auto check_divisor_upper_blocks = [&](cudaStream_t *streams,
                                          uint32_t *gpu_indexes,
                                          uint32_t gpu_count) {
      auto &trivial_blocks = divisor_ms_blocks;
      if (trivial_blocks.is_empty()) {
        cuda_memset_async(at_least_one_upper_block_is_non_zero.first_block(), 0,
                          big_lwe_size_bytes, streams[0], gpu_indexes[0]);
      } else {

        // We could call unchecked_scalar_ne
        // But we are in the special case where scalar == 0
        // So we can skip some stuff
        host_compare_with_zero_equality(
            streams, gpu_indexes, gpu_count, tmp_1.data, trivial_blocks.data,
            mem_ptr->comparison_buffer, bsks, ksks, trivial_blocks.len,
            mem_ptr->comparison_buffer->eq_buffer->is_non_zero_lut);

        tmp_1.len =
            ceil_div(trivial_blocks.len, message_modulus * carry_modulus - 1);

        is_at_least_one_comparisons_block_true(
            streams, gpu_indexes, gpu_count,
            at_least_one_upper_block_is_non_zero.data, tmp_1.data,
            mem_ptr->comparison_buffer, bsks, ksks, tmp_1.len);
      }
    };

    // Creates a cleaned version (noise wise) of the merged remainder
    // so that it can be safely used in bivariate PBSes
    // fills:
    //  `cleaned_merged_interesting_remainder` - radix ciphertext
    auto create_clean_version_of_merged_remainder =
        [&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) {
          integer_radix_apply_univariate_lookup_table_kb(
              streams, gpu_indexes, gpu_count,
              cleaned_merged_interesting_remainder.data,
              cleaned_merged_interesting_remainder.data, bsks, ksks,
              cleaned_merged_interesting_remainder.len,
              mem_ptr->message_extract_lut_1);
        };

    // phase 2
    for (uint j = 0; j < gpu_count; j++) {
      cuda_synchronize_stream(streams[j], gpu_indexes[j]);
    }
#pragma omp parallel sections
    {
#pragma omp section
      {
        // new_remainder
        // subtraction_overflowed
        do_overflowing_sub(mem_ptr->sub_streams_1, gpu_indexes, gpu_count);
      }
#pragma omp section
      {
        // at_least_one_upper_block_is_non_zero
        check_divisor_upper_blocks(mem_ptr->sub_streams_2, gpu_indexes,
                                   gpu_count);
      }
#pragma omp section
      {
        // cleaned_merged_interesting_remainder
        create_clean_version_of_merged_remainder(mem_ptr->sub_streams_3,
                                                 gpu_indexes, gpu_count);
      }
    }
    for (uint j = 0; j < gpu_count; j++) {
      cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
      cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
      cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]);
    }

    host_addition(streams[0], gpu_indexes[0], overflow_sum.data,
                  subtraction_overflowed.data,
                  at_least_one_upper_block_is_non_zero.data,
                  radix_params.big_lwe_dimension, 1);

    // The last iteration (i == 0) uses factor 2, all the others factor 3; the
    // factor also selects which of the two zero-out lookup tables is applied.
    int factor = (i) ? 3 : 2;
    int factor_lut_id = factor - 2;
    overflow_sum_radix.fill_with_same_ciphertext(
        overflow_sum.first_block(), cleaned_merged_interesting_remainder.len,
        streams[0], gpu_indexes[0]);

    auto conditionally_zero_out_merged_interesting_remainder =
        [&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) {
          integer_radix_apply_bivariate_lookup_table_kb<Torus>(
              streams, gpu_indexes, gpu_count,
              cleaned_merged_interesting_remainder.data,
              cleaned_merged_interesting_remainder.data,
              overflow_sum_radix.data, bsks, ksks,
              cleaned_merged_interesting_remainder.len,
              mem_ptr->zero_out_if_overflow_did_not_happen[factor_lut_id],
              factor);
        };

    auto conditionally_zero_out_merged_new_remainder =
        [&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) {
          integer_radix_apply_bivariate_lookup_table_kb<Torus>(
              streams, gpu_indexes, gpu_count, new_remainder.data,
              new_remainder.data, overflow_sum_radix.data, bsks, ksks,
              new_remainder.len,
              mem_ptr->zero_out_if_overflow_happened[factor_lut_id], factor);
        };

    auto set_quotient_bit = [&](cudaStream_t *streams, uint32_t *gpu_indexes,
                                uint32_t gpu_count) {
      integer_radix_apply_bivariate_lookup_table_kb<Torus>(
          streams, gpu_indexes, gpu_count, did_not_overflow.data,
          subtraction_overflowed.data,
          at_least_one_upper_block_is_non_zero.data, bsks, ksks, 1,
          mem_ptr->merge_overflow_flags_luts[pos_in_block],
          mem_ptr->merge_overflow_flags_luts[pos_in_block]
              ->params.message_modulus);

      // `streams` is the lambda parameter here, so the addition is queued on
      // the same sub-stream as the lookup above.
      host_addition(streams[0], gpu_indexes[0],
                    &quotient[block_of_bit * big_lwe_size],
                    &quotient[block_of_bit * big_lwe_size],
                    did_not_overflow.data, radix_params.big_lwe_dimension, 1);
    };

    // phase 3
    for (uint j = 0; j < gpu_count; j++) {
      cuda_synchronize_stream(streams[j], gpu_indexes[j]);
    }
#pragma omp parallel sections
    {
#pragma omp section
      {
        // cleaned_merged_interesting_remainder
        conditionally_zero_out_merged_interesting_remainder(
            mem_ptr->sub_streams_1, gpu_indexes, gpu_count);
      }
#pragma omp section
      {
        // new_remainder
        conditionally_zero_out_merged_new_remainder(mem_ptr->sub_streams_2,
                                                    gpu_indexes, gpu_count);
      }
#pragma omp section
      {
        // quotient
        set_quotient_bit(mem_ptr->sub_streams_3, gpu_indexes, gpu_count);
      }
    }
    for (uint j = 0; j < gpu_count; j++) {
      cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
      cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
      cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]);
    }

    // Both lists hold blocks 0..=last_non_trivial_block, that is exactly
    // `first_trivial_block` blocks: they were cloned from that inclusive range
    // and the insert/pop pair of phase 1 leaves the length unchanged.
    assert(first_trivial_block == cleaned_merged_interesting_remainder.len);
    assert(first_trivial_block == new_remainder.len);

    remainder1.copy_from(cleaned_merged_interesting_remainder, 0,
                         first_trivial_block - 1, streams[0], gpu_indexes[0]);
    remainder2.copy_from(new_remainder, 0, first_trivial_block - 1, streams[0],
                         gpu_indexes[0]);
  }

  assert(remainder1.len == remainder2.len);

  // Clean the quotient and remainder
  // as even though they have no carries, they are not at nominal noise level
  host_addition(streams[0], gpu_indexes[0], remainder, remainder1.data,
                remainder2.data, radix_params.big_lwe_dimension,
                remainder1.len);

  for (uint j = 0; j < gpu_count; j++) {
    cuda_synchronize_stream(streams[j], gpu_indexes[j]);
  }
#pragma omp parallel sections
  {
#pragma omp section
    {
      integer_radix_apply_univariate_lookup_table_kb(
          mem_ptr->sub_streams_1, gpu_indexes, gpu_count, remainder, remainder,
          bsks, ksks, num_blocks, mem_ptr->message_extract_lut_1);
    }
#pragma omp section
    {
      integer_radix_apply_univariate_lookup_table_kb(
          mem_ptr->sub_streams_2, gpu_indexes, gpu_count, quotient, quotient,
          bsks, ksks, num_blocks, mem_ptr->message_extract_lut_2);
    }
  }
  for (uint j = 0; j < gpu_count; j++) {
    cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
    cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
  }
}
#endif // TFHE_RS_DIV_REM_CUH

View File

@@ -1,165 +1,127 @@
#include "integer/integer.cuh"
#include <linear_algebra.h>
void cuda_full_propagation_64_inplace(void **streams, uint32_t *gpu_indexes,
uint32_t gpu_count, void *input_blocks,
int8_t *mem_ptr, void **ksks, void **bsks,
uint32_t num_blocks) {
void cuda_full_propagation_64_inplace(
cuda_stream_t *stream, void *input_blocks, int8_t *mem_ptr, void *ksk,
void *bsk, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t ks_base_log, uint32_t ks_level,
uint32_t pbs_base_log, uint32_t pbs_level, uint32_t grouping_factor,
uint32_t num_blocks) {
int_fullprop_buffer<uint64_t> *buffer =
(int_fullprop_buffer<uint64_t> *)mem_ptr;
host_full_propagate_inplace<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(input_blocks), buffer, (uint64_t **)(ksks), bsks,
num_blocks);
switch (polynomial_size) {
case 256:
host_full_propagate_inplace<uint64_t, int64_t, AmortizedDegree<256>>(
stream, static_cast<uint64_t *>(input_blocks),
(int_fullprop_buffer<uint64_t> *)mem_ptr, static_cast<uint64_t *>(ksk),
bsk, lwe_dimension, glwe_dimension, polynomial_size, ks_base_log,
ks_level, pbs_base_log, pbs_level, grouping_factor, num_blocks);
break;
case 512:
host_full_propagate_inplace<uint64_t, int64_t, AmortizedDegree<512>>(
stream, static_cast<uint64_t *>(input_blocks),
(int_fullprop_buffer<uint64_t> *)mem_ptr, static_cast<uint64_t *>(ksk),
bsk, lwe_dimension, glwe_dimension, polynomial_size, ks_base_log,
ks_level, pbs_base_log, pbs_level, grouping_factor, num_blocks);
break;
case 1024:
host_full_propagate_inplace<uint64_t, int64_t, AmortizedDegree<1024>>(
stream, static_cast<uint64_t *>(input_blocks),
(int_fullprop_buffer<uint64_t> *)mem_ptr, static_cast<uint64_t *>(ksk),
bsk, lwe_dimension, glwe_dimension, polynomial_size, ks_base_log,
ks_level, pbs_base_log, pbs_level, grouping_factor, num_blocks);
break;
case 2048:
host_full_propagate_inplace<uint64_t, int64_t, AmortizedDegree<2048>>(
stream, static_cast<uint64_t *>(input_blocks),
(int_fullprop_buffer<uint64_t> *)mem_ptr, static_cast<uint64_t *>(ksk),
bsk, lwe_dimension, glwe_dimension, polynomial_size, ks_base_log,
ks_level, pbs_base_log, pbs_level, grouping_factor, num_blocks);
break;
case 4096:
host_full_propagate_inplace<uint64_t, int64_t, AmortizedDegree<4096>>(
stream, static_cast<uint64_t *>(input_blocks),
(int_fullprop_buffer<uint64_t> *)mem_ptr, static_cast<uint64_t *>(ksk),
bsk, lwe_dimension, glwe_dimension, polynomial_size, ks_base_log,
ks_level, pbs_base_log, pbs_level, grouping_factor, num_blocks);
break;
case 8192:
host_full_propagate_inplace<uint64_t, int64_t, AmortizedDegree<8192>>(
stream, static_cast<uint64_t *>(input_blocks),
(int_fullprop_buffer<uint64_t> *)mem_ptr, static_cast<uint64_t *>(ksk),
bsk, lwe_dimension, glwe_dimension, polynomial_size, ks_base_log,
ks_level, pbs_base_log, pbs_level, grouping_factor, num_blocks);
break;
case 16384:
host_full_propagate_inplace<uint64_t, int64_t, AmortizedDegree<16384>>(
stream, static_cast<uint64_t *>(input_blocks),
(int_fullprop_buffer<uint64_t> *)mem_ptr, static_cast<uint64_t *>(ksk),
bsk, lwe_dimension, glwe_dimension, polynomial_size, ks_base_log,
ks_level, pbs_base_log, pbs_level, grouping_factor, num_blocks);
break;
default:
break;
}
}
void scratch_cuda_full_propagation_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t num_radix_blocks,
cuda_stream_t *stream, int8_t **mem_ptr, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
glwe_dimension * polynomial_size, lwe_dimension,
ks_level, ks_base_log, pbs_level, pbs_base_log,
grouping_factor, message_modulus, carry_modulus);
scratch_cuda_full_propagation<uint64_t>(
(cudaStream_t *)streams, gpu_indexes, gpu_count,
(int_fullprop_buffer<uint64_t> **)mem_ptr, params, num_radix_blocks,
stream, (int_fullprop_buffer<uint64_t> **)mem_ptr, lwe_dimension,
glwe_dimension, polynomial_size, level_count, grouping_factor,
input_lwe_ciphertext_count, message_modulus, carry_modulus, pbs_type,
allocate_gpu_memory);
}
void cleanup_cuda_full_propagation(void **streams, uint32_t *gpu_indexes,
uint32_t gpu_count, int8_t **mem_ptr_void) {
void cleanup_cuda_full_propagation(cuda_stream_t *stream,
int8_t **mem_ptr_void) {
int_fullprop_buffer<uint64_t> *mem_ptr =
(int_fullprop_buffer<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
cuda_drop_async(mem_ptr->lut_buffer, stream);
cuda_drop_async(mem_ptr->lut_indexes, stream);
cuda_drop_async(mem_ptr->pbs_buffer, stream);
cuda_drop_async(mem_ptr->tmp_small_lwe_vector, stream);
cuda_drop_async(mem_ptr->tmp_big_lwe_vector, stream);
}
void scratch_cuda_propagate_single_carry_kb_64_inplace(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory) {
void scratch_cuda_propagate_single_carry_low_latency_kb_64_inplace(
cuda_stream_t *stream, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, bool allocate_gpu_memory) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
big_lwe_dimension, small_lwe_dimension, ks_level,
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
message_modulus, carry_modulus);
scratch_cuda_propagate_single_carry_kb_inplace(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_sc_prop_memory<uint64_t> **)mem_ptr, num_blocks, params,
scratch_cuda_propagate_single_carry_low_latency_kb_inplace(
stream, (int_sc_prop_memory<uint64_t> **)mem_ptr, num_blocks, params,
allocate_gpu_memory);
}
void cuda_propagate_single_carry_kb_64_inplace(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
void *carry_out, int8_t *mem_ptr, void **bsks, void **ksks,
uint32_t num_blocks) {
host_propagate_single_carry<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array), static_cast<uint64_t *>(carry_out),
(int_sc_prop_memory<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
num_blocks);
void cuda_propagate_single_carry_low_latency_kb_64_inplace(
cuda_stream_t *stream, void *lwe_array, int8_t *mem_ptr, void *bsk,
void *ksk, uint32_t num_blocks) {
host_propagate_single_carry_low_latency<uint64_t>(
stream, static_cast<uint64_t *>(lwe_array),
(int_sc_prop_memory<uint64_t> *)mem_ptr, bsk,
static_cast<uint64_t *>(ksk), num_blocks);
}
void cleanup_cuda_propagate_single_carry(void **streams, uint32_t *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void) {
void cleanup_cuda_propagate_single_carry_low_latency(cuda_stream_t *stream,
int8_t **mem_ptr_void) {
int_sc_prop_memory<uint64_t> *mem_ptr =
(int_sc_prop_memory<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
}
// 64-bit entry point creating the scratch memory for a univariate lookup
// table application: packs the scalar parameters into an `int_radix_params`
// and forwards everything, with the untyped pointers cast to their 64-bit
// types, to `scratch_cuda_apply_univariate_lut_kb<uint64_t>`.
void scratch_cuda_apply_univariate_lut_kb_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
    void *input_lut, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
    PBS_TYPE pbs_type, bool allocate_gpu_memory) {
  // The fourth radix parameter is derived from the GLWE parameters.
  const uint32_t glwe_product = glwe_dimension * polynomial_size;
  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          glwe_product, lwe_dimension, ks_level, ks_base_log,
                          pbs_level, pbs_base_log, grouping_factor,
                          message_modulus, carry_modulus);

  auto *typed_streams = (cudaStream_t *)(streams);
  auto **lut_slot = (int_radix_lut<uint64_t> **)mem_ptr;
  auto *lut_values = static_cast<uint64_t *>(input_lut);

  scratch_cuda_apply_univariate_lut_kb<uint64_t>(
      typed_streams, gpu_indexes, gpu_count, lut_slot, lut_values,
      num_radix_blocks, params, allocate_gpu_memory);
}
// 64-bit entry point applying a univariate lookup table: casts the untyped
// pointers to their 64-bit types and forwards to
// `host_apply_univariate_lut_kb<uint64_t>`.
void cuda_apply_univariate_lut_kb_64(void **streams, uint32_t *gpu_indexes,
                                     uint32_t gpu_count, void *output_radix_lwe,
                                     void *input_radix_lwe, int8_t *mem_ptr,
                                     void **ksks, void **bsks,
                                     uint32_t num_blocks) {
  auto *typed_streams = (cudaStream_t *)(streams);
  auto *output_blocks = static_cast<uint64_t *>(output_radix_lwe);
  auto *input_blocks = static_cast<uint64_t *>(input_radix_lwe);
  auto *lut = (int_radix_lut<uint64_t> *)mem_ptr;
  auto **typed_ksks = (uint64_t **)(ksks);

  host_apply_univariate_lut_kb<uint64_t>(typed_streams, gpu_indexes, gpu_count,
                                         output_blocks, input_blocks, lut,
                                         typed_ksks, bsks, num_blocks);
}
// Releases the resources of the lookup-table object referenced by
// `*mem_ptr_void` by calling its `release` method. Only `release` is called:
// `*mem_ptr_void` itself is neither deleted nor reset here.
void cleanup_cuda_apply_univariate_lut_kb_64(void **streams,
                                             uint32_t *gpu_indexes,
                                             uint32_t gpu_count,
                                             int8_t **mem_ptr_void) {
  auto *lut = (int_radix_lut<uint64_t> *)(*mem_ptr_void);
  auto *typed_streams = (cudaStream_t *)(streams);
  lut->release(typed_streams, gpu_indexes, gpu_count);
}
// 64-bit entry point creating the scratch memory for a bivariate lookup
// table application: packs the scalar parameters into an `int_radix_params`
// and forwards everything, with the untyped pointers cast to their 64-bit
// types, to `scratch_cuda_apply_bivariate_lut_kb<uint64_t>`.
void scratch_cuda_apply_bivariate_lut_kb_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
    void *input_lut, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
    PBS_TYPE pbs_type, bool allocate_gpu_memory) {
  // The fourth radix parameter is derived from the GLWE parameters.
  const uint32_t glwe_product = glwe_dimension * polynomial_size;
  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          glwe_product, lwe_dimension, ks_level, ks_base_log,
                          pbs_level, pbs_base_log, grouping_factor,
                          message_modulus, carry_modulus);

  auto *typed_streams = (cudaStream_t *)(streams);
  auto **lut_slot = (int_radix_lut<uint64_t> **)mem_ptr;
  auto *lut_values = static_cast<uint64_t *>(input_lut);

  scratch_cuda_apply_bivariate_lut_kb<uint64_t>(
      typed_streams, gpu_indexes, gpu_count, lut_slot, lut_values,
      num_radix_blocks, params, allocate_gpu_memory);
}
// 64-bit entry point applying a bivariate lookup table to two input radix
// ciphertexts: casts the untyped pointers to their 64-bit types and forwards
// to `host_apply_bivariate_lut_kb<uint64_t>`, passing `shift` through as-is.
void cuda_apply_bivariate_lut_kb_64(void **streams, uint32_t *gpu_indexes,
                                    uint32_t gpu_count, void *output_radix_lwe,
                                    void *input_radix_lwe_1,
                                    void *input_radix_lwe_2, int8_t *mem_ptr,
                                    void **ksks, void **bsks,
                                    uint32_t num_blocks, uint32_t shift) {
  auto *typed_streams = (cudaStream_t *)(streams);
  auto *output_blocks = static_cast<uint64_t *>(output_radix_lwe);
  auto *first_input = static_cast<uint64_t *>(input_radix_lwe_1);
  auto *second_input = static_cast<uint64_t *>(input_radix_lwe_2);
  auto *lut = (int_radix_lut<uint64_t> *)mem_ptr;
  auto **typed_ksks = (uint64_t **)(ksks);

  host_apply_bivariate_lut_kb<uint64_t>(
      typed_streams, gpu_indexes, gpu_count, output_blocks, first_input,
      second_input, lut, typed_ksks, bsks, num_blocks, shift);
}
void cleanup_cuda_apply_bivariate_lut_kb_64(void **streams,
uint32_t *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void) {
int_radix_lut<uint64_t> *mem_ptr = (int_radix_lut<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
mem_ptr->release(stream);
}

Some files were not shown because too many files have changed in this diff Show More