Compare commits

..

1 Commits

Author SHA1 Message Date
Guillermo Oyarzun
b5b9b7a032 chore(gpu): add classical pbs params for gpu 2024-06-19 12:31:49 +02:00
711 changed files with 17227 additions and 46049 deletions

View File

@@ -3,8 +3,6 @@ self-hosted-runner:
labels:
- m1mac
- 4090-desktop
- large_windows_16_latest
- large_ubuntu_16
# Configuration variables in array of strings defined in your repository or
# organization. `null` means disabling configuration variables check.
# Empty array means no configuration variable is allowed.

View File

@@ -1,120 +0,0 @@
# Run backward compatibility tests
name: Backward compatibility Tests on CPU
env:
CARGO_TERM_COLOR: always
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUSTFLAGS: "-C target-cpu=native"
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
workflow_dispatch:
pull_request:
jobs:
setup-instance:
name: Setup instance (backward-compat-tests)
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
backend: aws
profile: cpu-small
backward-compat-tests:
name: Backward compatibility tests
needs: [ setup-instance ]
concurrency:
group: ${{ github.workflow }}_${{ github.ref }}
cancel-in-progress: true
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
persist-credentials: 'false'
- name: Set up home
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
with:
toolchain: stable
- name: Install git-lfs
run: |
sudo apt update && sudo apt -y install git-lfs
- name: Use specific data branch
if: ${{ contains(github.event.pull_request.labels.*.name, 'data_PR') }}
env:
PR_BRANCH: ${{ github.head_ref || github.ref_name }}
run: |
echo "BACKWARD_COMPAT_DATA_BRANCH=${PR_BRANCH}" >> "${GITHUB_ENV}"
- name: Get backward compat branch
id: backward_compat_branch
run: |
BRANCH="$(make backward_compat_branch)"
echo "branch=${BRANCH}" >> "${GITHUB_OUTPUT}"
- name: Clone test data
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
persist-credentials: 'false'
repository: zama-ai/tfhe-backward-compat-data
path: tfhe/tfhe-backward-compat-data
lfs: 'true'
ref: ${{ steps.backward_compat_branch.outputs.branch }}
- name: Run backward compatibility tests
run: |
make test_backward_compatibility_ci
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Backward compatibility tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
teardown-instance:
name: Teardown instance (backward-compat-tests)
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
needs: [ setup-instance, backward-compat-tests ]
runs-on: ubuntu-latest
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
label: ${{ needs.setup-instance.outputs.runner-name }}
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Instance teardown (backward-compat-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"

View File

@@ -1,4 +1,4 @@
# Run a small subset of tests to ensure quick feedback.
# Run a small subset of shortint and integer tests to ensure quick feedback.
name: Fast AWS Tests on CPU
env:
@@ -11,7 +11,6 @@ env:
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}
on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
@@ -19,112 +18,15 @@ on:
pull_request:
jobs:
should-run:
runs-on: ubuntu-latest
permissions:
pull-requests: write
outputs:
csprng_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.csprng_any_changed }}
zk_pok_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.zk_pok_any_changed }}
core_crypto_test: ${{ env.IS_PULL_REQUEST == 'false' ||
steps.changed-files.outputs.core_crypto_any_changed ||
steps.changed-files.outputs.dependencies_any_changed }}
boolean_test: ${{ env.IS_PULL_REQUEST == 'false' ||
steps.changed-files.outputs.boolean_any_changed ||
steps.changed-files.outputs.dependencies_any_changed }}
shortint_test: ${{ env.IS_PULL_REQUEST == 'false' ||
steps.changed-files.outputs.shortint_any_changed ||
steps.changed-files.outputs.dependencies_any_changed }}
integer_test: ${{ env.IS_PULL_REQUEST == 'false' ||
steps.changed-files.outputs.integer_any_changed ||
steps.changed-files.outputs.dependencies_any_changed }}
wasm_test: ${{ env.IS_PULL_REQUEST == 'false' ||
steps.changed-files.outputs.wasm_any_changed ||
steps.changed-files.outputs.dependencies_any_changed }}
high_level_api_test: ${{ env.IS_PULL_REQUEST == 'false' ||
steps.changed-files.outputs.high_level_api_any_changed ||
steps.changed-files.outputs.dependencies_any_changed }}
user_docs_test: ${{ env.IS_PULL_REQUEST == 'false' ||
steps.changed-files.outputs.user_docs_any_changed ||
steps.changed-files.outputs.dependencies_any_changed }}
any_file_changed: ${{ env.IS_PULL_REQUEST == 'false' || steps.aggregated-changes.outputs.any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
fetch-depth: 0
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@40853de9f8ce2d6cfdc73c1b96f14e22ba44aec4
with:
since_last_remote_commit: true
files_yaml: |
dependencies:
- tfhe/Cargo.toml
- concrete-csprng/**
- tfhe-zk-pok/**
csprng:
- concrete-csprng/**
zk_pok:
- tfhe-zk-pok/**
core_crypto:
- tfhe/src/core_crypto/**
boolean:
- tfhe/src/core_crypto/**
- tfhe/src/boolean/**
shortint:
- tfhe/src/core_crypto/**
- tfhe/src/shortint/**
integer:
- tfhe/src/core_crypto/**
- tfhe/src/shortint/**
- tfhe/src/integer/**
wasm:
- tfhe/src/**
- tfhe/js_on_wasm_tests/**
- tfhe/web_wasm_parallel_tests/**
- '!tfhe/src/c_api/**'
- '!tfhe/src/boolean/**'
high_level_api:
- tfhe/src/**
- '!tfhe/src/c_api/**'
- '!tfhe/src/boolean/**'
- '!tfhe/src/c_api/**'
- '!tfhe/src/js_on_wasm_api/**'
user_docs:
- tfhe/src/**
- '!tfhe/src/c_api/**'
- 'tfhe/docs/**.md'
- README.md
- name: Aggregate file changes
id: aggregated-changes
if: ( steps.changed-files.outputs.dependencies_any_changed == 'true' ||
steps.changed-files.outputs.csprng_any_changed == 'true' ||
steps.changed-files.outputs.zk_pok_any_changed == 'true' ||
steps.changed-files.outputs.core_crypto_any_changed == 'true' ||
steps.changed-files.outputs.boolean_any_changed == 'true' ||
steps.changed-files.outputs.shortint_any_changed == 'true' ||
steps.changed-files.outputs.integer_any_changed == 'true' ||
steps.changed-files.outputs.wasm_any_changed == 'true' ||
steps.changed-files.outputs.high_level_api_any_changed == 'true' ||
steps.changed-files.outputs.user_docs_any_changed == 'true')
run: |
echo "any_changed=true" >> "$GITHUB_OUTPUT"
setup-instance:
name: Setup instance (fast-tests)
if: github.event_name != 'pull_request' ||
needs.should-run.outputs.any_file_changed == 'true'
needs: should-run
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -135,9 +37,7 @@ jobs:
fast-tests:
name: Fast CPU tests
if: github.event_name != 'pull_request' ||
(github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
needs: [ should-run, setup-instance ]
needs: setup-instance
concurrency:
group: ${{ github.workflow }}_${{ github.ref }}
cancel-in-progress: true
@@ -153,58 +53,55 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
with:
toolchain: stable
- name: Run concrete-csprng tests
if: needs.should-run.outputs.csprng_test == 'true'
run: |
make test_concrete_csprng
- name: Run tfhe-zk-pok tests
if: needs.should-run.outputs.zk_pok_test == 'true'
run: |
make test_zk_pok
- name: Run core tests
if: needs.should-run.outputs.core_crypto_test == 'true'
run: |
AVX512_SUPPORT=ON make test_core_crypto
- name: Run boolean tests
if: needs.should-run.outputs.boolean_test == 'true'
run: |
make test_boolean
- name: Run user docs tests
if: needs.should-run.outputs.user_docs_test == 'true'
run: |
make test_user_doc
- name: Run js on wasm API tests
if: needs.should-run.outputs.wasm_test == 'true'
run: |
make test_nodejs_wasm_api_in_docker
- name: Gen Keys if required
if: needs.should-run.outputs.shortint_test == 'true' ||
needs.should-run.outputs.integer_test == 'true'
run: |
make gen_key_cache
- name: Run shortint tests
if: needs.should-run.outputs.shortint_test == 'true'
run: |
BIG_TESTS_INSTANCE=TRUE FAST_TESTS=TRUE make test_shortint_ci
- name: Run integer tests
if: needs.should-run.outputs.integer_test == 'true'
run: |
BIG_TESTS_INSTANCE=TRUE FAST_TESTS=TRUE make test_integer_ci
- name: Run shortint multi-bit tests
run: |
BIG_TESTS_INSTANCE=TRUE FAST_TESTS=TRUE make test_shortint_multi_bit_ci
- name: Run integer multi-bit tests
run: |
BIG_TESTS_INSTANCE=TRUE FAST_TESTS=TRUE make test_integer_multi_bit_ci
- name: Run high-level API tests
if: needs.should-run.outputs.high_level_api_test == 'true'
run: |
make test_high_level_api
@@ -228,7 +125,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -17,16 +17,11 @@ on:
workflow_dispatch:
pull_request:
types: [ labeled ]
schedule:
# Nightly tests @ 1AM after each work day
- cron: "0 1 * * MON-FRI"
jobs:
cuda-tests-linux:
name: CUDA tests (RTX 4090)
if: github.event_name == 'workflow_dispatch' ||
contains(github.event.label.name, '4090_test') ||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
if: ${{ github.event_name == 'workflow_dispatch' || contains(github.event.label.name, '4090_test') }}
concurrency:
group: ${{ github.workflow }}_${{ github.ref }}
cancel-in-progress: true
@@ -39,7 +34,7 @@ jobs:
persist-credentials: 'false'
- name: Install latest stable
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
with:
toolchain: stable

View File

@@ -1,5 +1,5 @@
# Compile and test tfhe-cuda-backend on an AWS instance
name: TFHE Cuda Backend - Fast tests
name: TFHE Cuda Backend - Full tests
env:
CARGO_TERM_COLOR: always
@@ -11,7 +11,6 @@ env:
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}
on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
@@ -19,64 +18,26 @@ on:
pull_request:
jobs:
should-run:
runs-on: ubuntu-latest
permissions:
pull-requests: write
outputs:
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
fetch-depth: 0
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@40853de9f8ce2d6cfdc73c1b96f14e22ba44aec4
with:
since_last_remote_commit: true
files_yaml: |
gpu:
- tfhe/Cargo.toml
- tfhe/build.rs
- backends/tfhe-cuda-backend/**
- tfhe/src/core_crypto/gpu/**
- tfhe/src/integer/gpu/**
- tfhe/shortint/parameters/**
- tfhe/src/high_level_api/**
- tfhe/src/c_api/**
- 'tfhe/docs/**.md'
- '.github/workflows/gpu_fast_tests.yml'
- Makefile
- scripts/**
- ci/**
setup-instance:
name: Setup instance (cuda-tests)
needs: should-run
if: github.event_name != 'pull_request' ||
needs.should-run.outputs.gpu_test == 'true'
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
backend: hyperstack
backend: aws
profile: gpu-test
cuda-tests-linux:
name: CUDA tests
needs: [ should-run, setup-instance ]
if: github.event_name != 'pull_request' ||
(github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
cuda-pcc:
name: CUDA post-commit checks
needs: setup-instance
concurrency:
group: ${{ github.workflow }}_${{ github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
@@ -88,23 +49,11 @@ jobs:
include:
- os: ubuntu-22.04
cuda: "12.2"
gcc: 11
gcc: 9
env:
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
CMAKE_VERSION: 3.29.6
steps:
# Mandatory on hyperstack since a bootable volume is not re-usable yet.
- name: Install dependencies
run: |
sudo apt update
sudo apt install -y checkinstall zlib1g-dev libssl-dev
wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
cd cmake-${{ env.CMAKE_VERSION }}
./bootstrap
make -j"$(nproc)"
sudo make install
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
@@ -115,7 +64,7 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
with:
toolchain: stable
@@ -138,14 +87,77 @@ jobs:
echo "HOME=/home/ubuntu";
} >> "${GITHUB_ENV}"
- name: Check device is detected
if: ${{ !cancelled() }}
run: nvidia-smi
- name: Run core crypto and internal CUDA backend tests
- name: Run fmt checks
run: |
make test_core_crypto_gpu
make test_cuda_backend
make check_fmt_gpu
- name: Run clippy checks
run: |
make pcc_gpu
- name: Slack Notification
if: ${{ always() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "CUDA AWS post-commit checks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
cuda-tests-linux:
name: CUDA tests
needs: [ setup-instance, cuda-pcc ]
concurrency:
group: ${{ github.workflow }}_${{ github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
strategy:
fail-fast: false
# explicit include-based build matrix, of known valid options
matrix:
include:
- os: ubuntu-22.04
cuda: "12.2"
gcc: 9
env:
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
persist-credentials: 'false'
- name: Set up home
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
with:
toolchain: stable
- name: Export CUDA variables
if: ${{ !cancelled() }}
run: |
echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
# Specify the correct host compilers
- name: Export gcc and g++ variables
if: ${{ !cancelled() }}
run: |
{
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
echo "HOME=/home/ubuntu";
} >> "${GITHUB_ENV}"
- name: Run core crypto, integer and internal CUDA backend tests
run: |
make test_gpu
- name: Run user docs tests
run: |
@@ -159,28 +171,23 @@ jobs:
run: |
make test_high_level_api_gpu
slack-notify:
name: Slack Notification
needs: [ setup-instance, cuda-tests-linux ]
runs-on: ubuntu-latest
if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' }}
continue-on-error: true
steps:
- name: Send message
- name: Slack Notification
if: ${{ always() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}
SLACK_MESSAGE: "Base GPU tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.ACTION_RUN_URL }})"
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "CUDA AWS tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
teardown-instance:
name: Teardown instance (cuda-tests)
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
needs: [ setup-instance, cuda-tests-linux ]
needs: [ setup-instance, cuda-pcc, cuda-tests-linux ]
runs-on: ubuntu-latest
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -13,66 +13,24 @@ env:
# We clear the cache to reduce memory pressure because of the numerous processes of cargo
# nextest
TFHE_RS_CLEAR_IN_MEMORY_KEY_CACHE: "1"
NO_BIG_PARAMS: FALSE
on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
workflow_dispatch:
pull_request:
types: [labeled]
push:
branches:
- main
types: [ labeled ]
jobs:
should-run:
if:
(github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs') ||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
(github.event_name == 'pull_request' && contains(github.event.label.name, 'approved')) ||
github.event_name == 'workflow_dispatch'
runs-on: ubuntu-latest
permissions:
pull-requests: write
outputs:
integer_test: ${{ github.event_name == 'workflow_dispatch' ||
steps.changed-files.outputs.integer_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
fetch-depth: 0
persist-credentials: "false"
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@40853de9f8ce2d6cfdc73c1b96f14e22ba44aec4
with:
since_last_remote_commit: true
files_yaml: |
integer:
- tfhe/Cargo.toml
- concrete-csprng/**
- tfhe-zk-pok/**
- tfhe/src/core_crypto/**
- tfhe/src/shortint/**
- tfhe/src/integer/**
setup-instance:
name: Setup instance (unsigned-integer-tests)
needs: should-run
if:
(github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs' && needs.should-run.outputs.integer_test == 'true') ||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
(github.event_name == 'pull_request' && contains(github.event.label.name, 'approved')) ||
github.event_name == 'workflow_dispatch'
if: ${{ github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'approved') }}
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -85,29 +43,24 @@ jobs:
name: Unsigned integer tests
needs: setup-instance
concurrency:
group: ${{ github.workflow }}_${{ github.ref }}${{ github.ref == 'refs/heads/main' && github.sha || '' }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
group: ${{ github.workflow }}_${{ github.ref }}
cancel-in-progress: true
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
persist-credentials: "false"
persist-credentials: 'false'
- name: Set up home
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
with:
toolchain: stable
- name: Should skip big parameters set
if: github.event_name == 'pull_request'
run: |
echo "NO_BIG_PARAMS=TRUE" >> "${GITHUB_ENV}"
- name: Gen Keys if required
run: |
make GEN_KEY_CACHE_MULTI_BIT_ONLY=TRUE gen_key_cache
@@ -122,7 +75,7 @@ jobs:
- name: Run unsigned integer tests
run: |
AVX512_SUPPORT=ON NO_BIG_PARAMS=${{ env.NO_BIG_PARAMS }} BIG_TESTS_INSTANCE=TRUE make test_unsigned_integer_ci
AVX512_SUPPORT=ON BIG_TESTS_INSTANCE=TRUE make test_unsigned_integer_ci
- name: Slack Notification
if: ${{ always() }}
@@ -135,12 +88,12 @@ jobs:
teardown-instance:
name: Teardown instance (unsigned-integer-tests)
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
needs: [setup-instance, unsigned-integer-tests]
needs: [ setup-instance, unsigned-integer-tests ]
runs-on: ubuntu-latest
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -1,5 +1,5 @@
# Perfom tfhe-cuda-backend post-commit checks on an AWS instance
name: TFHE Cuda Backend - Post-commit Checks
# Compile and test tfhe-cuda-backend on an AWS instance
name: TFHE Cuda Backend - Full tests multi-GPU
env:
CARGO_TERM_COLOR: always
@@ -13,32 +13,34 @@ env:
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
workflow_dispatch:
pull_request:
jobs:
setup-instance:
name: Setup instance (cuda-pcc)
name: Setup instance (cuda-tests-multi-gpu)
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
backend: aws
profile: gpu-build
profile: multi-gpu-test
cuda-pcc:
name: CUDA post-commit checks
needs: setup-instance
cuda-tests-linux:
name: CUDA multi-GPU tests
needs: [ setup-instance ]
concurrency:
group: ${{ github.workflow }}_${{ github.ref }}
cancel-in-progress: true
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
strategy:
fail-fast: false
@@ -54,15 +56,13 @@ jobs:
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
persist-credentials: 'false'
- name: Set up home
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
with:
toolchain: stable
@@ -85,13 +85,21 @@ jobs:
echo "HOME=/home/ubuntu";
} >> "${GITHUB_ENV}"
- name: Run fmt checks
- name: Run core crypto, integer and internal CUDA backend tests
run: |
make check_fmt_gpu
make test_gpu
- name: Run clippy checks
- name: Run user docs tests
run: |
make pcc_gpu
make test_user_doc_gpu
- name: Test C API
run: |
make test_c_api_gpu
- name: Run High Level API Tests
run: |
make test_high_level_api_gpu
- name: Slack Notification
if: ${{ always() }}
@@ -99,17 +107,17 @@ jobs:
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "CUDA AWS post-commit checks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
SLACK_MESSAGE: "CUDA AWS multi-GPU tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
teardown-instance:
name: Teardown instance (cuda-pcc)
name: Teardown instance (cuda-tests-multi-gpu)
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
needs: [ setup-instance, cuda-pcc ]
needs: [ setup-instance, cuda-tests-linux ]
runs-on: ubuntu-latest
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -123,4 +131,4 @@ jobs:
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Instance teardown (cuda-pcc) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
SLACK_MESSAGE: "Instance teardown (cuda-tests-multi-gpu) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"

View File

@@ -13,66 +13,24 @@ env:
# We clear the cache to reduce memory pressure because of the numerous processes of cargo
# nextest
TFHE_RS_CLEAR_IN_MEMORY_KEY_CACHE: "1"
NO_BIG_PARAMS: FALSE
on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
workflow_dispatch:
pull_request:
types: [labeled]
push:
branches:
- main
types: [ labeled ]
jobs:
should-run:
if:
(github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs') ||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
(github.event_name == 'pull_request' && contains(github.event.label.name, 'approved')) ||
github.event_name == 'workflow_dispatch'
runs-on: ubuntu-latest
permissions:
pull-requests: write
outputs:
integer_test: ${{ github.event_name == 'workflow_dispatch' ||
steps.changed-files.outputs.integer_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
fetch-depth: 0
persist-credentials: "false"
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@40853de9f8ce2d6cfdc73c1b96f14e22ba44aec4
with:
since_last_remote_commit: true
files_yaml: |
integer:
- tfhe/Cargo.toml
- concrete-csprng/**
- tfhe-zk-pok/**
- tfhe/src/core_crypto/**
- tfhe/src/shortint/**
- tfhe/src/integer/**
setup-instance:
name: Setup instance (unsigned-integer-tests)
needs: should-run
if:
(github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs' && needs.should-run.outputs.integer_test == 'true') ||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
(github.event_name == 'pull_request' && contains(github.event.label.name, 'approved')) ||
github.event_name == 'workflow_dispatch'
name: Setup instance (signed-integer-tests)
if: ${{ github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'approved') }}
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -85,29 +43,24 @@ jobs:
name: Signed integer tests
needs: setup-instance
concurrency:
group: ${{ github.workflow }}_${{ github.ref }}${{ github.ref == 'refs/heads/main' && github.sha || '' }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
group: ${{ github.workflow }}_${{ github.ref }}
cancel-in-progress: true
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
persist-credentials: "false"
persist-credentials: 'false'
- name: Set up home
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
with:
toolchain: stable
- name: Should skip big parameters set
if: github.event_name == 'pull_request'
run: |
echo "NO_BIG_PARAMS=TRUE" >> "${GITHUB_ENV}"
- name: Gen Keys if required
run: |
make GEN_KEY_CACHE_MULTI_BIT_ONLY=TRUE gen_key_cache
@@ -126,7 +79,7 @@ jobs:
- name: Run signed integer tests
run: |
AVX512_SUPPORT=ON NO_BIG_PARAMS=${{ env.NO_BIG_PARAMS }} BIG_TESTS_INSTANCE=TRUE make test_signed_integer_ci
AVX512_SUPPORT=ON BIG_TESTS_INSTANCE=TRUE make test_signed_integer_ci
- name: Slack Notification
if: ${{ always() }}
@@ -139,12 +92,12 @@ jobs:
teardown-instance:
name: Teardown instance (signed-integer-tests)
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
needs: [setup-instance, signed-integer-tests]
needs: [ setup-instance, signed-integer-tests ]
runs-on: ubuntu-latest
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -63,7 +63,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@40853de9f8ce2d6cfdc73c1b96f14e22ba44aec4
uses: tj-actions/changed-files@d6babd6899969df1a11d14c368283ea4436bca78
with:
since_last_remote_commit: true
files_yaml: |
@@ -86,8 +86,6 @@ jobs:
high_level_api:
- tfhe/src/**
- '!tfhe/src/c_api/**'
- '!tfhe/src/boolean/**'
- '!tfhe/src/js_on_wasm_api/**'
c_api:
- tfhe/src/**
examples:
@@ -123,7 +121,7 @@ jobs:
setup-instance:
name: Setup instance (cpu-tests)
if: github.event_name != 'pull_request' ||
(github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.any_file_changed == 'true')
(github.event_name == 'pull_request' && needs.should-run.outputs.any_file_changed == 'true')
needs: should-run
runs-on: ubuntu-latest
outputs:
@@ -131,7 +129,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -160,7 +158,7 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
with:
toolchain: stable
@@ -237,7 +235,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -27,7 +27,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -54,7 +54,7 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
with:
toolchain: stable
@@ -72,7 +72,7 @@ jobs:
- name: Run parallel wasm tests
run: |
make test_web_js_api_parallel_ci
make ci_test_web_js_api_parallel
- name: Slack Notification
if: ${{ always() }}
@@ -90,7 +90,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -1,145 +0,0 @@
# Run core crypto benchmarks on an AWS instance and return parsed results to Slab CI bot.
name: Core crypto benchmarks
on:
workflow_dispatch:
env:
CARGO_TERM_COLOR: always
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
jobs:
setup-instance:
name: Setup instance (core-crypto-benchmarks)
runs-on: ubuntu-latest
if: github.event_name != 'schedule' ||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
backend: aws
profile: bench
core-crypto-benchmarks:
name: Execute core crypto benchmarks in EC2
needs: setup-instance
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
concurrency:
group: ${{ github.workflow }}_${{ github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
steps:
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
fetch-depth: 0
- name: Get benchmark details
run: |
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
- name: Set up home
# "Install rust" step require root user to have a HOME directory which is not set.
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
with:
toolchain: nightly
- name: Run benchmarks with AVX512
run: |
make bench_pbs
make bench_pbs128
make bench_ks
- name: Parse results
run: |
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
--database tfhe_rs \
--hardware "hpc7a.96xlarge" \
--project-version "${{ env.COMMIT_HASH }}" \
--branch ${{ github.ref_name }} \
--commit-date "${{ env.COMMIT_DATE }}" \
--bench-date "${{ env.BENCH_DATE }}" \
--name-suffix avx512 \
--walk-subdirs \
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
with:
name: ${{ github.sha }}_core_crypto
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
repository: zama-ai/slab
path: slab
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
- name: Send data to Slab
shell: bash
run: |
echo "Computing HMac on downloaded artifact"
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
echo "Sending results to Slab..."
curl -v -k \
-H "Content-Type: application/json" \
-H "X-Slab-Repository: ${{ github.repository }}" \
-H "X-Slab-Command: store_data_v2" \
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-d @${{ env.RESULTS_FILENAME }} \
${{ secrets.SLAB_URL }}
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "PBS benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
teardown-instance:
name: Teardown instance (core-crypto-benchmarks)
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
needs: [ setup-instance, core-crypto-benchmarks ]
runs-on: ubuntu-latest
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
label: ${{ needs.setup-instance.outputs.runner-name }}
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Instance teardown (core-crypto-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"

View File

@@ -1,191 +0,0 @@
# Run all signed integer benchmarks on an AWS instance and return parsed results to Slab CI bot.
name: Signed Integer full benchmarks
on:
workflow_dispatch:
inputs:
all_precisions:
description: "Run all precisions"
type: boolean
default: false
schedule:
# Weekly benchmarks will be triggered each Saturday at 1a.m.
- cron: '0 1 * * 6'
# Quarterly benchmarks will be triggered right before end of quarter, the 25th of the current month at 4a.m.
# These benchmarks are far longer to execute hence the reason to run them only four time a year.
- cron: '0 4 25 MAR,JUN,SEP,DEC *'
env:
CARGO_TERM_COLOR: always
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
FAST_BENCH: TRUE
jobs:
prepare-matrix:
name: Prepare operations matrix
runs-on: ubuntu-latest
if: github.event_name != 'schedule' ||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
outputs:
op_flavor: ${{ steps.set_op_flavor.outputs.op_flavor }}
steps:
- name: Weekly benchmarks
if: github.event_name == 'workflow_dispatch' ||
github.event.schedule == '0 1 * * 6'
run: |
echo "OP_FLAVOR=[\"default\"]" >> "${GITHUB_ENV}"
- name: Quarterly benchmarks
if: github.event.schedule == '0 4 25 MAR,JUN,SEP,DEC *'
run: |
echo "OP_FLAVOR=[\"default\", \"unchecked\"]" >> "${GITHUB_ENV}"
- name: Set operation flavor output
id: set_op_flavor
run: |
echo "op_flavor=${{ toJSON(env.OP_FLAVOR) }}" >> "${GITHUB_OUTPUT}"
setup-instance:
name: Setup instance (signed-integer-benchmarks)
needs: prepare-matrix
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
backend: aws
profile: bench
signed-integer-benchmarks:
name: Execute signed integer benchmarks for all operations flavor
needs: [ prepare-matrix, setup-instance ]
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
concurrency:
group: ${{ github.workflow }}_${{ github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
continue-on-error: true
timeout-minutes: 1440 # 24 hours
strategy:
max-parallel: 1
matrix:
command: [ integer, integer_multi_bit ]
op_flavor: [ default, unchecked ]
steps:
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
fetch-depth: 0
- name: Get benchmark details
run: |
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
- name: Set up home
# "Install rust" step require root user to have a HOME directory which is not set.
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
with:
toolchain: nightly
- name: Checkout Slab repo
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
repository: zama-ai/slab
path: slab
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
- name: Should run benchmarks with all precisions
if: inputs.all_precisions
run: |
echo "FAST_BENCH=FALSE" >> "${GITHUB_ENV}"
- name: Run benchmarks with AVX512
run: |
make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_signed_${{ matrix.command }}
- name: Parse results
run: |
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
--database tfhe_rs \
--hardware "hpc7a.96xlarge" \
--project-version "${{ env.COMMIT_HASH }}" \
--branch ${{ github.ref_name }} \
--commit-date "${{ env.COMMIT_DATE }}" \
--bench-date "${{ env.BENCH_DATE }}" \
--walk-subdirs \
--name-suffix avx512 \
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
with:
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
path: ${{ env.RESULTS_FILENAME }}
- name: Send data to Slab
shell: bash
run: |
echo "Computing HMac on results file"
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
echo "Sending results to Slab..."
curl -v -k \
-H "Content-Type: application/json" \
-H "X-Slab-Repository: ${{ github.repository }}" \
-H "X-Slab-Command: store_data_v2" \
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-d @${{ env.RESULTS_FILENAME }} \
${{ secrets.SLAB_URL }}
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Signed integer full benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
teardown-instance:
name: Teardown instance (integer-benchmarks)
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
needs: [ setup-instance, signed-integer-benchmarks ]
runs-on: ubuntu-latest
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
label: ${{ needs.setup-instance.outputs.runner-name }}
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Instance teardown (signed-integer-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"

View File

@@ -1,197 +0,0 @@
# Run PKE Zero-Knowledge benchmarks on an instance and return parsed results to Slab CI bot.
name: PKE ZK benchmarks
on:
workflow_dispatch:
push:
branches:
- main
schedule:
# Weekly benchmarks will be triggered each Saturday at 3a.m.
- cron: '0 3 * * 6'
env:
CARGO_TERM_COLOR: always
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
jobs:
should-run:
runs-on: ubuntu-latest
if: github.event_name == 'workflow_dispatch' ||
((github.event_name == 'push' || github.event_name == 'schedule') && github.repository == 'zama-ai/tfhe-rs')
outputs:
zk_pok_changed: ${{ steps.changed-files.outputs.zk_pok_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
fetch-depth: 0
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@40853de9f8ce2d6cfdc73c1b96f14e22ba44aec4
with:
since_last_remote_commit: true
files_yaml: |
zk_pok:
- tfhe/Cargo.toml
- concrete-csprng/**
- tfhe-zk-pok/**
- tfhe/src/core_crypto/**
- tfhe/src/shortint/**
- tfhe/src/integer/**
- tfhe/src/zk.rs
- tfhe/benches/integer/zk_pke.rs
- .github/workflows/zk_pke_benchmark.yml
setup-instance:
name: Setup instance (pke-zk-benchmarks)
runs-on: ubuntu-latest
needs: should-run
if: github.event_name == 'workflow_dispatch' ||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
(github.event_name == 'push' &&
github.repository == 'zama-ai/tfhe-rs' &&
needs.should-run.outputs.zk_pok_changed == 'true')
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
backend: aws
profile: bench
pke-zk-benchmarks:
name: Execute PKE ZK benchmarks
if: needs.setup-instance.result != 'skipped'
needs: setup-instance
concurrency:
group: ${{ github.workflow }}_${{github.event_name}}_${{ github.ref }}${{ github.ref == 'refs/heads/main' && github.sha || '' }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
fetch-depth: 0
- name: Get benchmark details
run: |
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
- name: Set up home
# "Install rust" step require root user to have a HOME directory which is not set.
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
with:
toolchain: nightly
- name: Checkout Slab repo
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
repository: zama-ai/slab
path: slab
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
- name: Run benchmarks with AVX512
run: |
make bench_integer_zk
- name: Parse results
run: |
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
--database tfhe_rs \
--hardware "hpc7a.96xlarge" \
--backend cpu \
--project-version "${{ env.COMMIT_HASH }}" \
--branch ${{ github.ref_name }} \
--commit-date "${{ env.COMMIT_DATE }}" \
--bench-date "${{ env.BENCH_DATE }}" \
--walk-subdirs \
--name-suffix avx512 \
--throughput
- name: Parse CRS sizes results
run: |
python3 ./ci/benchmark_parser.py tfhe/pke_zk_crs_sizes.csv ${{ env.RESULTS_FILENAME }} \
--key-sizes \
--append-results
- name: Upload parsed results artifact
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
with:
name: ${{ github.sha }}_integer_zk
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
repository: zama-ai/slab
path: slab
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
- name: Send data to Slab
shell: bash
run: |
echo "Computing HMac on results file"
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
echo "Sending results to Slab..."
curl -v -k \
-H "Content-Type: application/json" \
-H "X-Slab-Repository: ${{ github.repository }}" \
-H "X-Slab-Command: store_data_v2" \
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-d @${{ env.RESULTS_FILENAME }} \
${{ secrets.SLAB_URL }}
- name: Slack Notification
if: ${{ !success() && !cancelled() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "PKE ZK benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
teardown-instance:
name: Teardown instance (pke-zk-benchmarks)
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
needs: [ setup-instance, pke-zk-benchmarks ]
runs-on: ubuntu-latest
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
label: ${{ needs.setup-instance.outputs.runner-name }}
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Instance teardown (pke-zk-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"

View File

@@ -3,9 +3,30 @@ name: Boolean benchmarks
on:
workflow_dispatch:
schedule:
# Weekly benchmarks will be triggered each Saturday at 1a.m.
- cron: '0 1 * * 6'
inputs:
instance_id:
description: "Instance ID"
type: string
instance_image_id:
description: "Instance AMI ID"
type: string
instance_type:
description: "Instance product type"
type: string
runner_name:
description: "Action runner name"
type: string
request_id:
description: "Slab request ID"
type: string
# This input is not used in this workflow but still mandatory since a calling workflow could
# use it. If a triggering command include a user_inputs field, then the triggered workflow
# must include this very input, otherwise the workflow won't be called.
# See start_full_benchmarks.yml as example.
user_inputs:
description: "Type of benchmarks to run"
type: string
default: "weekly_benchmarks"
env:
CARGO_TERM_COLOR: always
@@ -13,60 +34,36 @@ env:
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
jobs:
setup-instance:
name: Setup instance (boolean-benchmarks)
runs-on: ubuntu-latest
if: github.event_name != 'schedule' ||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
backend: aws
profile: bench
boolean-benchmarks:
run-boolean-benchmarks:
name: Execute boolean benchmarks in EC2
needs: setup-instance
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
concurrency:
group: ${{ github.workflow }}_${{ github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
continue-on-error: true
runs-on: ${{ github.event.inputs.runner_name }}
if: ${{ !cancelled() }}
steps:
- name: Instance configuration used
run: |
echo "IDs: ${{ inputs.instance_id }}"
echo "AMI: ${{ inputs.instance_image_id }}"
echo "Type: ${{ inputs.instance_type }}"
echo "Request ID: ${{ inputs.request_id }}"
- name: Get benchmark date
run: |
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
fetch-depth: 0
- name: Get benchmark details
run: |
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
- name: Set up home
# "Install rust" step require root user to have a HOME directory which is not set.
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
with:
toolchain: nightly
@@ -76,12 +73,14 @@ jobs:
- name: Parse results
run: |
COMMIT_DATE="$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})"
COMMIT_HASH="$(git describe --tags --dirty)"
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
--database tfhe_rs \
--hardware "hpc7a.96xlarge" \
--project-version "${{ env.COMMIT_HASH }}" \
--hardware ${{ inputs.instance_type }} \
--project-version "${COMMIT_HASH}" \
--branch ${{ github.ref_name }} \
--commit-date "${{ env.COMMIT_DATE }}" \
--commit-date "${COMMIT_DATE}" \
--bench-date "${{ env.BENCH_DATE }}" \
--walk-subdirs \
--name-suffix avx512 \
@@ -98,7 +97,7 @@ jobs:
--append-results
- name: Upload parsed results artifact
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
with:
name: ${{ github.sha }}_boolean
path: ${{ env.RESULTS_FILENAME }}
@@ -130,28 +129,8 @@ jobs:
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ job.status }}
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_MESSAGE: "Boolean benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
teardown-instance:
name: Teardown instance (boolean-benchmarks)
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
needs: [ setup-instance, boolean-benchmarks ]
runs-on: ubuntu-latest
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
label: ${{ needs.setup-instance.outputs.runner-name }}
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Instance teardown (boolean-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

View File

@@ -19,21 +19,14 @@ jobs:
strategy:
matrix:
# GitHub macos-latest are now M1 macs, so use ours, we limit what runs so it will be fast
# even with a few PRs
os: [large_ubuntu_16, macos-latest, windows-latest]
os: [ubuntu-latest, macos-latest-large, windows-latest]
fail-fast: false
steps:
- uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
- name: Install latest stable
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
with:
toolchain: stable
- name: Install and run newline linter checks
if: ${{ contains(matrix.os, 'ubuntu') }}
if: matrix.os == 'ubuntu-latest'
run: |
wget https://github.com/fernandrone/linelint/releases/download/0.0.6/linelint-linux-amd64
echo "16b70fb7b471d6f95cbdc0b4e5dc2b0ac9e84ba9ecdc488f7bdf13df823aca4b linelint-linux-amd64" > checksum
@@ -43,33 +36,27 @@ jobs:
make check_newline
- name: Run pcc checks
if: ${{ contains(matrix.os, 'ubuntu') }}
run: |
make pcc
- name: Build concrete-csprng
if: ${{ contains(matrix.os, 'ubuntu') }}
run: |
make build_concrete_csprng
- name: Build Release core
if: ${{ contains(matrix.os, 'ubuntu') }}
run: |
make build_core AVX512_SUPPORT=ON
make build_core_experimental AVX512_SUPPORT=ON
- name: Build Release boolean
if: ${{ contains(matrix.os, 'ubuntu') }}
run: |
make build_boolean
- name: Build Release shortint
if: ${{ contains(matrix.os, 'ubuntu') }}
run: |
make build_shortint
- name: Build Release integer
if: ${{ contains(matrix.os, 'ubuntu') }}
run: |
make build_integer
@@ -78,12 +65,10 @@ jobs:
make build_tfhe_full
- name: Build Release c_api
if: ${{ contains(matrix.os, 'ubuntu') }}
run: |
make build_c_api
- name: Build coverage tests
if: ${{ contains(matrix.os, 'ubuntu') }}
run: |
make build_tfhe_coverage

View File

@@ -10,7 +10,7 @@ jobs:
- name: Check first line
uses: gsactions/commit-message-checker@16fa2d5de096ae0d35626443bcd24f1e756cafee
with:
pattern: '^((feat|fix|chore|refactor|style|test|docs|doc)(\([\w\-_]+\))?\!?\:) .+$'
pattern: '^((feat|fix|chore|refactor|style|test|docs|doc)(\([\w\-_]+\))?\:) .+$'
flags: "gs"
error: 'Your first line has to contain a commit type and scope like "feat(my_feature): msg".'
excludeDescription: "true" # optional: this excludes the description body of a pull request

View File

@@ -25,7 +25,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -51,13 +51,13 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
with:
toolchain: stable
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@40853de9f8ce2d6cfdc73c1b96f14e22ba44aec4
uses: tj-actions/changed-files@d6babd6899969df1a11d14c368283ea4436bca78
with:
files_yaml: |
tfhe:
@@ -125,7 +125,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -0,0 +1,128 @@
# Run core crypto benchmarks on an AWS instance and return parsed results to Slab CI bot.
name: Core crypto benchmarks
on:
workflow_dispatch:
inputs:
instance_id:
description: "Instance ID"
type: string
instance_image_id:
description: "Instance AMI ID"
type: string
instance_type:
description: "Instance product type"
type: string
runner_name:
description: "Action runner name"
type: string
request_id:
description: "Slab request ID"
type: string
# This input is not used in this workflow but still mandatory since a calling workflow could
# use it. If a triggering command include a user_inputs field, then the triggered workflow
# must include this very input, otherwise the workflow won't be called.
# See start_full_benchmarks.yml as example.
user_inputs:
description: "Type of benchmarks to run"
type: string
default: "weekly_benchmarks"
env:
CARGO_TERM_COLOR: always
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
jobs:
run-core-crypto-benchmarks:
name: Execute core crypto benchmarks in EC2
runs-on: ${{ github.event.inputs.runner_name }}
if: ${{ !cancelled() }}
steps:
- name: Instance configuration used
run: |
echo "IDs: ${{ inputs.instance_id }}"
echo "AMI: ${{ inputs.instance_image_id }}"
echo "Type: ${{ inputs.instance_type }}"
echo "Request ID: ${{ inputs.request_id }}"
- name: Get benchmark date
run: |
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
fetch-depth: 0
- name: Set up home
# "Install rust" step require root user to have a HOME directory which is not set.
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
with:
toolchain: nightly
- name: Run benchmarks with AVX512
run: |
make bench_pbs
make bench_pbs128
make bench_ks
- name: Parse results
run: |
COMMIT_DATE="$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})"
COMMIT_HASH="$(git describe --tags --dirty)"
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
--database tfhe_rs \
--hardware ${{ inputs.instance_type }} \
--project-version "${COMMIT_HASH}" \
--branch ${{ github.ref_name }} \
--commit-date "${COMMIT_DATE}" \
--bench-date "${{ env.BENCH_DATE }}" \
--name-suffix avx512 \
--walk-subdirs \
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
with:
name: ${{ github.sha }}_core_crypto
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
repository: zama-ai/slab
path: slab
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
- name: Send data to Slab
shell: bash
run: |
echo "Computing HMac on downloaded artifact"
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
echo "Sending results to Slab..."
curl -v -k \
-H "Content-Type: application/json" \
-H "X-Slab-Repository: ${{ github.repository }}" \
-H "X-Slab-Command: store_data_v2" \
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-d @${{ env.RESULTS_FILENAME }} \
${{ secrets.SLAB_URL }}
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ job.status }}
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_MESSAGE: "PBS benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

View File

@@ -27,7 +27,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -50,7 +50,7 @@ jobs:
gcc: 11
env:
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
CMAKE_VERSION: 3.29.6
CMAKE_VERSION: 3.29.1
steps:
# Mandatory on hyperstack since a bootable volume is not re-usable yet.
- name: Install dependencies
@@ -83,7 +83,7 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
with:
toolchain: nightly
@@ -128,7 +128,7 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
with:
name: ${{ github.sha }}_core_crypto
path: ${{ env.RESULTS_FILENAME }}
@@ -175,7 +175,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -27,7 +27,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -54,7 +54,7 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
with:
toolchain: stable
@@ -78,7 +78,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -1,123 +0,0 @@
name: Close or Merge corresponding PR on the data repo
# When a PR with the data_PR tag is closed or merged, this will close the corresponding PR in the data repo.
env:
TARGET_REPO_API_URL: ${{ github.api_url }}/repos/zama-ai/tfhe-backward-compat-data
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
PR_BRANCH: ${{ github.head_ref || github.ref_name }}
CLOSE_TYPE: ${{ github.event.pull_request.merged && 'merge' || 'close' }}
# only trigger on pull request closed events
on:
pull_request:
types: [ closed ]
# The same pattern is used for jobs that use the github api:
# - save the result of the API call in the env var "GH_API_RES". Since the var is multiline
# we use this trick: https://docs.github.com/en/actions/using-workflows/workflow-commands-for-github-actions#example-of-a-multiline-string
# - "set +e" will make sure we reach the last "echo EOF" even in case of error
# - "set -o" pipefail makes one line piped command return the error of the first failure
# - 'RES="$?"' and 'exit $RES' are used to return the error code if a command failed. Without it, with "set +e"
# the script will always return 0 because of the "echo EOF".
jobs:
auto_close_job:
if: ${{ contains(github.event.pull_request.labels.*.name, 'data_PR') }}
runs-on: ubuntu-latest
steps:
- name: Find corresponding Pull Request in the data repo
run: |
{
set +e
set -o pipefail
echo 'TARGET_REPO_PR<<EOF'
curl --fail-with-body --no-progress-meter -L -X GET \
-H "Accept: application/vnd.github+json" \
-H "X-GitHub-Api-Version: 2022-11-28" \
${{ env.TARGET_REPO_API_URL }}/pulls\?head=${{ github.repository_owner }}:${{ env.PR_BRANCH }} | jq -e '.[0]' | sed 's/null/{ "message": "corresponding PR not found" }/'
RES="$?"
echo EOF
} >> "${GITHUB_ENV}"
exit $RES
- name: Comment on the PR to indicate the reason of the close
run: |
{
set +e
set -o pipefail
echo 'GH_API_RES<<EOF'
curl --fail-with-body --no-progress-meter -L -X POST \
-H "Accept: application/vnd.github+json" \
-H "Authorization: Bearer ${{ secrets.FHE_ACTIONS_TOKEN }}" \
-H "X-GitHub-Api-Version: 2022-11-28" \
${{ fromJson(env.TARGET_REPO_PR).comments_url }} \
-d '{ "body": "PR ${{ env.CLOSE_TYPE }}d because the corresponding PR in main repo was ${{ env.CLOSE_TYPE }}d: ${{ github.repository }}#${{ github.event.number }}" }'
RES="$?"
echo EOF
} >> "${GITHUB_ENV}"
exit $RES
- name: Merge the Pull Request in the data repo
if: ${{ github.event.pull_request.merged }}
run: |
{
set +e
set -o pipefail
echo 'GH_API_RES<<EOF'
curl --fail-with-body --no-progress-meter -L -X PUT \
-H "Accept: application/vnd.github+json" \
-H "Authorization: Bearer ${{ secrets.FHE_ACTIONS_TOKEN }}" \
-H "X-GitHub-Api-Version: 2022-11-28" \
${{ fromJson(env.TARGET_REPO_PR).url }}/merge \
-d '{ "merge_method": "rebase" }'
RES="$?"
echo EOF
} >> "${GITHUB_ENV}"
exit $RES
- name: Close the Pull Request in the data repo
if: ${{ !github.event.pull_request.merged }}
run: |
{
set +e
set -o pipefail
echo 'GH_API_RES<<EOF'
curl --fail-with-body --no-progress-meter -L -X PATCH \
-H "Accept: application/vnd.github+json" \
-H "Authorization: Bearer ${{ secrets.FHE_ACTIONS_TOKEN }}" \
-H "X-GitHub-Api-Version: 2022-11-28" \
${{ fromJson(env.TARGET_REPO_PR).url }} \
-d '{ "state": "closed" }'
RES="$?"
echo EOF
} >> "${GITHUB_ENV}"
exit $RES
- name: Delete the associated branch in the data repo
run: |
{
set +e
set -o pipefail
echo 'GH_API_RES<<EOF'
curl --fail-with-body --no-progress-meter -L -X DELETE \
-H "Accept: application/vnd.github+json" \
-H "Authorization: Bearer ${{ secrets.FHE_ACTIONS_TOKEN }}" \
-H "X-GitHub-Api-Version: 2022-11-28" \
${{ env.TARGET_REPO_API_URL }}/git/refs/heads/${{ env.PR_BRANCH }}
RES="$?"
echo EOF
} >> "${GITHUB_ENV}"
exit $RES
- name: Slack Notification
if: ${{ always() && job.status == 'failure' }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Failed to auto-${{ env.CLOSE_TYPE }} PR on data repo: ${{ fromJson(env.GH_API_RES || env.TARGET_REPO_PR).message }}"

View File

@@ -1,5 +1,5 @@
# Run benchmarks on an RTX 4090 machine and return parsed results to Slab CI bot.
name: TFHE Cuda Backend - 4090 benchmarks
# Run all benchmarks on an RTX 4090 machine and return parsed results to Slab CI bot.
name: TFHE Cuda Backend - 4090 full benchmarks
env:
CARGO_TERM_COLOR: always
@@ -11,7 +11,6 @@ env:
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
FAST_BENCH: TRUE
on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
@@ -24,18 +23,19 @@ on:
jobs:
cuda-integer-benchmarks:
name: Cuda integer benchmarks (RTX 4090)
if: ${{ github.event_name == 'workflow_dispatch' ||
github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs' ||
contains(github.event.label.name, '4090_bench') }}
name: Cuda integer benchmarks for all operations flavor (RTX 4090)
if: ${{ github.event_name == 'workflow_dispatch' || github.event_name == 'schedule' || contains(github.event.label.name, '4090_bench') }}
concurrency:
group: ${{ github.workflow }}_${{ github.ref }}_cuda_integer_bench
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
cancel-in-progress: true
runs-on: ["self-hosted", "4090-desktop"]
timeout-minutes: 1440 # 24 hours
strategy:
fail-fast: false
max-parallel: 1
matrix:
command: [integer, integer_multi_bit]
op_flavor: [default, unchecked]
steps:
- name: Checkout tfhe-rs
@@ -50,10 +50,9 @@ jobs:
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
echo "FAST_BENCH=TRUE" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
with:
toolchain: nightly
@@ -66,7 +65,7 @@ jobs:
- name: Run integer benchmarks
run: |
make BENCH_OP_FLAVOR=default bench_integer_multi_bit_gpu
make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_${{ matrix.command }}_gpu
- name: Parse results
run: |
@@ -82,9 +81,9 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
with:
name: ${{ github.sha }}_integer_multi_bit_gpu_default
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
path: ${{ env.RESULTS_FILENAME }}
- name: Send data to Slab
@@ -115,7 +114,7 @@ jobs:
needs: cuda-integer-benchmarks
concurrency:
group: ${{ github.workflow }}_${{ github.ref }}_cuda_core_crypto_bench
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
cancel-in-progress: true
runs-on: ["self-hosted", "4090-desktop"]
timeout-minutes: 1440 # 24 hours
@@ -134,7 +133,7 @@ jobs:
} >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
with:
toolchain: nightly
@@ -145,7 +144,7 @@ jobs:
path: slab
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
- name: Run core crypto benchmarks
- name: Run integer benchmarks
run: |
make bench_pbs_gpu
make bench_ks_gpu
@@ -164,7 +163,7 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
with:
name: ${{ github.sha }}_core_crypto
path: ${{ env.RESULTS_FILENAME }}

View File

@@ -1,199 +0,0 @@
# Compile and test tfhe-cuda-backend on an H100 VM on hyperstack
name: TFHE Cuda Backend - Fast tests on H100
env:
CARGO_TERM_COLOR: always
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUSTFLAGS: "-C target-cpu=native"
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}
on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
workflow_dispatch:
pull_request:
types: [ labeled ]
jobs:
should-run:
runs-on: ubuntu-latest
permissions:
pull-requests: write
outputs:
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
fetch-depth: 0
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@40853de9f8ce2d6cfdc73c1b96f14e22ba44aec4
with:
since_last_remote_commit: true
files_yaml: |
gpu:
- tfhe/Cargo.toml
- tfhe/build.rs
- backends/tfhe-cuda-backend/**
- tfhe/src/core_crypto/gpu/**
- tfhe/src/integer/gpu/**
- tfhe/shortint/parameters/**
- tfhe/src/high_level_api/**
- tfhe/src/c_api/**
- 'tfhe/docs/**.md'
- Makefile
- '.github/workflows/gpu_fast_h100_tests.yml'
- scripts/**
- ci/**
setup-instance:
name: Setup instance (cuda-h100-tests)
needs: should-run
if: github.event_name != 'pull_request' ||
(github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true') ||
(github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
backend: hyperstack
profile: single-h100
cuda-tests-linux:
name: CUDA H100 tests
needs: [ should-run, setup-instance ]
if: github.event_name != 'pull_request' ||
(github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
concurrency:
group: ${{ github.workflow }}_${{ github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
strategy:
fail-fast: false
# explicit include-based build matrix, of known valid options
matrix:
include:
- os: ubuntu-22.04
cuda: "12.2"
gcc: 11
env:
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
CMAKE_VERSION: 3.29.6
steps:
# Mandatory on hyperstack since a bootable volume is not re-usable yet.
- name: Install dependencies
run: |
sudo apt update
sudo apt install -y checkinstall zlib1g-dev libssl-dev
wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
cd cmake-${{ env.CMAKE_VERSION }}
./bootstrap
make -j"$(nproc)"
sudo make install
- name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
persist-credentials: 'false'
- name: Set up home
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
with:
toolchain: stable
- name: Export CUDA variables
if: ${{ !cancelled() }}
run: |
echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
# Specify the correct host compilers
- name: Export gcc and g++ variables
if: ${{ !cancelled() }}
run: |
{
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
echo "HOME=/home/ubuntu";
} >> "${GITHUB_ENV}"
- name: Check device is detected
if: ${{ !cancelled() }}
run: nvidia-smi
- name: Run core crypto and internal CUDA backend tests
run: |
BIG_TESTS_INSTANCE=TRUE make test_core_crypto_gpu
BIG_TESTS_INSTANCE=TRUE make test_cuda_backend
- name: Run user docs tests
run: |
BIG_TESTS_INSTANCE=TRUE make test_user_doc_gpu
- name: Test C API
run: |
BIG_TESTS_INSTANCE=TRUE make test_c_api_gpu
- name: Run High Level API Tests
run: |
BIG_TESTS_INSTANCE=TRUE make test_high_level_api_gpu
slack-notify:
name: Slack Notification
needs: [ setup-instance, cuda-tests-linux ]
runs-on: ubuntu-latest
if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' }}
continue-on-error: true
steps:
- name: Send message
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}
SLACK_MESSAGE: "Fast H100 tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.ACTION_RUN_URL }})"
teardown-instance:
name: Teardown instance (cuda-h100-tests)
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
needs: [ setup-instance, cuda-tests-linux ]
runs-on: ubuntu-latest
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
label: ${{ needs.setup-instance.outputs.runner-name }}
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Instance teardown (cuda-h100-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"

View File

@@ -1,199 +0,0 @@
# Compile and test tfhe-cuda-backend on an AWS instance
name: TFHE Cuda Backend - Full tests multi-GPU
env:
CARGO_TERM_COLOR: always
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUSTFLAGS: "-C target-cpu=native"
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}
on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
workflow_dispatch:
pull_request:
types: [ labeled ]
jobs:
should-run:
runs-on: ubuntu-latest
permissions:
pull-requests: write
outputs:
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
fetch-depth: 0
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@40853de9f8ce2d6cfdc73c1b96f14e22ba44aec4
with:
since_last_remote_commit: true
files_yaml: |
gpu:
- tfhe/Cargo.toml
- tfhe/build.rs
- backends/tfhe-cuda-backend/**
- tfhe/src/core_crypto/gpu/**
- tfhe/src/integer/gpu/**
- tfhe/shortint/parameters/**
- tfhe/src/high_level_api/**
- tfhe/src/c_api/**
- 'tfhe/docs/**.md'
- Makefile
- '.github/workflows/**_multi_gpu_tests.yml'
- scripts/**
- ci/**
setup-instance:
name: Setup instance (cuda-tests-multi-gpu)
needs: should-run
if: github.event_name != 'pull_request' ||
(github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true') ||
(github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
backend: hyperstack
profile: multi-gpu-test
cuda-tests-linux:
name: CUDA multi-GPU tests
needs: [ should-run, setup-instance ]
if: github.event_name != 'pull_request' ||
(github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
concurrency:
group: ${{ github.workflow }}_${{ github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
strategy:
fail-fast: false
# explicit include-based build matrix, of known valid options
matrix:
include:
- os: ubuntu-22.04
cuda: "12.2"
gcc: 11
env:
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
CMAKE_VERSION: 3.29.6
steps:
# Mandatory on hyperstack since a bootable volume is not re-usable yet.
- name: Install dependencies
run: |
sudo apt update
sudo apt install -y checkinstall zlib1g-dev libssl-dev
wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
cd cmake-${{ env.CMAKE_VERSION }}
./bootstrap
make -j"$(nproc)"
sudo make install
- name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
persist-credentials: 'false'
- name: Set up home
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
with:
toolchain: stable
- name: Export CUDA variables
if: ${{ !cancelled() }}
run: |
echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
# Specify the correct host compilers
- name: Export gcc and g++ variables
if: ${{ !cancelled() }}
run: |
{
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
echo "HOME=/home/ubuntu";
} >> "${GITHUB_ENV}"
- name: Check device is detected
if: ${{ !cancelled() }}
run: nvidia-smi
# No need to test core_crypto and classic PBS in integer since it's already tested on single GPU.
- name: Run multi-bit CUDA integer tests
run: |
BIG_TESTS_INSTANCE=TRUE make test_integer_multi_bit_gpu_ci
- name: Run user docs tests
run: |
BIG_TESTS_INSTANCE=TRUE make test_user_doc_gpu
- name: Test C API
run: |
BIG_TESTS_INSTANCE=TRUE make test_c_api_gpu
- name: Run High Level API Tests
run: |
BIG_TESTS_INSTANCE=TRUE make test_high_level_api_gpu
slack-notify:
name: Slack Notification
needs: [ setup-instance, cuda-tests-linux ]
runs-on: ubuntu-latest
if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' }}
continue-on-error: true
steps:
- name: Send message
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}
SLACK_MESSAGE: "Multi-GPU tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.ACTION_RUN_URL }})"
teardown-instance:
name: Teardown instance (cuda-tests-multi-gpu)
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
needs: [ setup-instance, cuda-tests-linux ]
runs-on: ubuntu-latest
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
label: ${{ needs.setup-instance.outputs.runner-name }}
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Instance teardown (cuda-tests-multi-gpu) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"

View File

@@ -1,202 +0,0 @@
# Compile and test tfhe-cuda-backend signed integer on an AWS instance
name: TFHE Cuda Backend - Signed integer tests
env:
CARGO_TERM_COLOR: always
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUSTFLAGS: "-C target-cpu=native"
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
FAST_TESTS: TRUE
NIGHTLY_TESTS: FALSE
IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}
on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
workflow_dispatch:
pull_request:
types:
- opened
- synchronize
- labeled
schedule:
# Nightly tests @ 1AM after each work day
- cron: "0 1 * * MON-FRI"
jobs:
should-run:
runs-on: ubuntu-latest
permissions:
pull-requests: write
outputs:
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
fetch-depth: 0
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@40853de9f8ce2d6cfdc73c1b96f14e22ba44aec4
with:
since_last_remote_commit: true
files_yaml: |
gpu:
- tfhe/Cargo.toml
- tfhe/build.rs
- backends/tfhe-cuda-backend/**
- tfhe/src/core_crypto/gpu/**
- tfhe/src/integer/gpu/**
- tfhe/shortint/parameters/**
- tfhe/src/high_level_api/**
- tfhe/src/c_api/**
- 'tfhe/docs/**.md'
- '.github/workflows/gpu_signed_integer_tests.yml'
- Makefile
- scripts/**
- ci/**
setup-instance:
name: Setup instance (cuda-signed-integer-tests)
runs-on: ubuntu-latest
needs: should-run
if: (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
github.event_name == 'workflow_dispatch' ||
(github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true')
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
backend: hyperstack
profile: gpu-test
cuda-signed-integer-tests:
name: CUDA signed integer tests
needs: [ should-run, setup-instance ]
if: github.event_name != 'pull_request' ||
(github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
concurrency:
group: ${{ github.workflow }}_${{ github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
strategy:
fail-fast: false
# explicit include-based build matrix, of known valid options
matrix:
include:
- os: ubuntu-22.04
cuda: "12.2"
gcc: 11
env:
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
CMAKE_VERSION: 3.29.6
steps:
# Mandatory on hyperstack since a bootable volume is not re-usable yet.
- name: Install dependencies
run: |
sudo apt update
sudo apt install -y checkinstall zlib1g-dev libssl-dev
wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
cd cmake-${{ env.CMAKE_VERSION }}
./bootstrap
make -j"$(nproc)"
sudo make install
- name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
persist-credentials: 'false'
- name: Set up home
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
with:
toolchain: stable
- name: Export CUDA variables
if: ${{ !cancelled() }}
run: |
echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
# Specify the correct host compilers
- name: Export gcc and g++ variables
if: ${{ !cancelled() }}
run: |
{
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
echo "HOME=/home/ubuntu";
} >> "${GITHUB_ENV}"
- name: Should run nightly tests
if: github.event_name == 'schedule'
run: |
{
echo "FAST_TESTS=FALSE";
echo "NIGHTLY_TESTS=TRUE";
} >> "${GITHUB_ENV}"
- name: Check device is detected
if: ${{ !cancelled() }}
run: nvidia-smi
- name: Run signed integer multi-bit tests
run: |
make test_signed_integer_multi_bit_gpu_ci
slack-notify:
name: Slack Notification
needs: [ setup-instance, cuda-signed-integer-tests ]
runs-on: ubuntu-latest
if: ${{ always() && needs.cuda-signed-integer-tests.result != 'skipped' }}
continue-on-error: true
steps:
- name: Send message
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ needs.cuda-signed-integer-tests.result }}
SLACK_MESSAGE: "Base GPU tests finished with status: ${{ needs.cuda-signed-integer-tests.result }}. (${{ env.ACTION_RUN_URL }})"
teardown-instance:
name: Teardown instance (cuda-tests)
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
needs: [ setup-instance, cuda-signed-integer-tests ]
runs-on: ubuntu-latest
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
label: ${{ needs.setup-instance.outputs.runner-name }}
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Instance teardown (cuda-signed-integer-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"

View File

@@ -1,188 +0,0 @@
# Test unsigned integers on an H100 VM on hyperstack
name: TFHE Cuda Backend - Unsigned integer tests on H100
env:
CARGO_TERM_COLOR: always
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUSTFLAGS: "-C target-cpu=native"
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}
on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
workflow_dispatch:
pull_request:
types: [ labeled ]
jobs:
should-run:
runs-on: ubuntu-latest
permissions:
pull-requests: write
outputs:
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
fetch-depth: 0
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@40853de9f8ce2d6cfdc73c1b96f14e22ba44aec4
with:
since_last_remote_commit: true
files_yaml: |
gpu:
- tfhe/Cargo.toml
- tfhe/build.rs
- backends/tfhe-cuda-backend/**
- tfhe/src/core_crypto/gpu/**
- tfhe/src/integer/gpu/**
- tfhe/shortint/parameters/**
- tfhe/src/high_level_api/**
- tfhe/src/c_api/**
- 'tfhe/docs/**.md'
- Makefile
- '.github/workflows/gpu_unsigned_integer_tests.yml'
- scripts/**
- ci/**
setup-instance:
name: Setup instance (cuda-h100-tests)
needs: should-run
if: github.event_name != 'pull_request' ||
(github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true') ||
(github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
backend: hyperstack
profile: single-h100
cuda-tests-linux:
name: CUDA H100 unsigned integer tests
needs: [ should-run, setup-instance ]
if: github.event_name != 'pull_request' ||
(github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
concurrency:
group: ${{ github.workflow }}_${{ github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
strategy:
fail-fast: false
# explicit include-based build matrix, of known valid options
matrix:
include:
- os: ubuntu-22.04
cuda: "12.2"
gcc: 11
env:
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
CMAKE_VERSION: 3.29.6
steps:
# Mandatory on hyperstack since a bootable volume is not re-usable yet.
- name: Install dependencies
run: |
sudo apt update
sudo apt install -y checkinstall zlib1g-dev libssl-dev
wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
cd cmake-${{ env.CMAKE_VERSION }}
./bootstrap
make -j"$(nproc)"
sudo make install
- name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
- name: Set up home
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
with:
toolchain: stable
- name: Export CUDA variables
if: ${{ !cancelled() }}
run: |
echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
# Specify the correct host compilers
- name: Export gcc and g++ variables
if: ${{ !cancelled() }}
run: |
{
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
echo "HOME=/home/ubuntu";
} >> "${GITHUB_ENV}"
- name: Check device is detected
if: ${{ !cancelled() }}
run: nvidia-smi
- name: Run unsigned integer tests
run: |
BIG_TESTS_INSTANCE=TRUE make test_unsigned_integer_gpu_ci
- name: Run unsigned integer multi-bit tests
run: |
BIG_TESTS_INSTANCE=TRUE make test_unsigned_integer_multi_bit_gpu_ci
slack-notify:
name: Slack Notification
needs: [ setup-instance, cuda-tests-linux ]
runs-on: ubuntu-latest
if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' }}
continue-on-error: true
steps:
- name: Send message
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}
SLACK_MESSAGE: "Unsigned integer GPU H100 tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.ACTION_RUN_URL }})"
teardown-instance:
name: Teardown instance (cuda-h100-tests)
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
needs: [ setup-instance, cuda-tests-linux ]
runs-on: ubuntu-latest
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
label: ${{ needs.setup-instance.outputs.runner-name }}
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Instance teardown (cuda-h100-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"

View File

@@ -1,199 +0,0 @@
# Compile and test tfhe-cuda-backend unsigned integer on an AWS instance
name: TFHE Cuda Backend - Unsigned integer tests
env:
CARGO_TERM_COLOR: always
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUSTFLAGS: "-C target-cpu=native"
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
FAST_TESTS: TRUE
NIGHTLY_TESTS: FALSE
on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
workflow_dispatch:
pull_request:
types:
- opened
- synchronize
- labeled
schedule:
# Nightly tests @ 1AM after each work day
- cron: "0 1 * * MON-FRI"
jobs:
should-run:
runs-on: ubuntu-latest
permissions:
pull-requests: write
outputs:
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
fetch-depth: 0
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@40853de9f8ce2d6cfdc73c1b96f14e22ba44aec4
with:
since_last_remote_commit: true
files_yaml: |
gpu:
- tfhe/Cargo.toml
- tfhe/build.rs
- backends/tfhe-cuda-backend/**
- tfhe/src/core_crypto/gpu/**
- tfhe/src/integer/gpu/**
- tfhe/shortint/parameters/**
- tfhe/src/high_level_api/**
- tfhe/src/c_api/**
- 'tfhe/docs/**.md'
- '.github/workflows/gpu_unsigned_integer_tests.yml'
- Makefile
- scripts/**
- ci/**
setup-instance:
name: Setup instance (cuda-unsigned-integer-tests)
needs: should-run
if: (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
github.event_name == 'workflow_dispatch' ||
(github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true')
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
backend: hyperstack
profile: gpu-test
cuda-unsigned-integer-tests:
name: CUDA unsigned integer tests
needs: [ should-run, setup-instance ]
if: github.event_name != 'pull_request' ||
(github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
concurrency:
group: ${{ github.workflow }}_${{ github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
strategy:
fail-fast: false
# explicit include-based build matrix, of known valid options
matrix:
include:
- os: ubuntu-22.04
cuda: "12.2"
gcc: 11
env:
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
CMAKE_VERSION: 3.29.6
steps:
# Mandatory on hyperstack since a bootable volume is not re-usable yet.
- name: Install dependencies
run: |
sudo apt update
sudo apt install -y checkinstall zlib1g-dev libssl-dev
wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
cd cmake-${{ env.CMAKE_VERSION }}
./bootstrap
make -j"$(nproc)"
sudo make install
- name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
- name: Set up home
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
with:
toolchain: stable
- name: Export CUDA variables
if: ${{ !cancelled() }}
run: |
echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
# Specify the correct host compilers
- name: Export gcc and g++ variables
if: ${{ !cancelled() }}
run: |
{
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
echo "HOME=/home/ubuntu";
} >> "${GITHUB_ENV}"
- name: Should run nightly tests
if: github.event_name == 'schedule'
run: |
{
echo "FAST_TESTS=FALSE";
echo "NIGHTLY_TESTS=TRUE";
} >> "${GITHUB_ENV}"
- name: Check device is detected
if: ${{ !cancelled() }}
run: nvidia-smi
- name: Run unsigned integer multi-bit tests
run: |
make test_unsigned_integer_multi_bit_gpu_ci
slack-notify:
name: Slack Notification
needs: [ setup-instance, cuda-unsigned-integer-tests ]
runs-on: ubuntu-latest
if: ${{ always() && needs.cuda-unsigned-integer-tests.result != 'skipped' }}
continue-on-error: true
steps:
- name: Send message
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ needs.cuda-unsigned-integer-tests.result }}
SLACK_MESSAGE: "Unsigned integer GPU tests finished with status: ${{ needs.cuda-unsigned-integer-tests.result }}. (${{ env.ACTION_RUN_URL }})"
teardown-instance:
name: Teardown instance (cuda-tests)
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
needs: [ setup-instance, cuda-unsigned-integer-tests ]
runs-on: ubuntu-latest
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
label: ${{ needs.setup-instance.outputs.runner-name }}
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Instance teardown (cuda-unsigned-integer-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"

View File

@@ -1,5 +1,5 @@
# Signed integer GPU tests on an H100 VM on hyperstack
name: TFHE Cuda Backend - Signed integer tests on H100
# Compile and test tfhe-cuda-backend on an H100 VM on hyperstack
name: TFHE Cuda Backend - Full tests on H100
env:
CARGO_TERM_COLOR: always
@@ -11,61 +11,22 @@ env:
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}
on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
workflow_dispatch:
pull_request:
types: [ labeled ]
jobs:
should-run:
runs-on: ubuntu-latest
permissions:
pull-requests: write
outputs:
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
fetch-depth: 0
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@40853de9f8ce2d6cfdc73c1b96f14e22ba44aec4
with:
since_last_remote_commit: true
files_yaml: |
gpu:
- tfhe/Cargo.toml
- tfhe/build.rs
- backends/tfhe-cuda-backend/**
- tfhe/src/core_crypto/gpu/**
- tfhe/src/integer/gpu/**
- tfhe/shortint/parameters/**
- tfhe/src/high_level_api/**
- tfhe/src/c_api/**
- 'tfhe/docs/**.md'
- Makefile
- '.github/workflows/gpu_signed_integer_h100_tests.yml'
- scripts/**
- ci/**
setup-instance:
name: Setup instance (cuda-h100-tests)
needs: should-run
if: github.event_name != 'pull_request' ||
(github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true') ||
(github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -75,10 +36,8 @@ jobs:
profile: single-h100
cuda-tests-linux:
name: CUDA H100 signed integer tests
needs: [ should-run, setup-instance ]
if: github.event_name != 'pull_request' ||
(github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
name: CUDA H100 tests
needs: [ setup-instance ]
concurrency:
group: ${{ github.workflow }}_${{ github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
@@ -93,13 +52,22 @@ jobs:
gcc: 11
env:
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
CMAKE_VERSION: 3.29.6
CMAKE_VERSION: 3.29.1
steps:
# Mandatory on hyperstack since a bootable volume is not re-usable yet.
- name: Install dependencies
run: |
sudo apt update
sudo apt install -y checkinstall zlib1g-dev libssl-dev
sudo apt install ca-certificates curl
sudo install -m 0755 -d /etc/apt/keyrings
sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
sudo chmod a+r /etc/apt/keyrings/docker.asc
echo \
"deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \
$(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \
sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
sudo apt update
sudo apt install -y checkinstall zlib1g-dev libssl-dev docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin
wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
cd cmake-${{ env.CMAKE_VERSION }}
@@ -115,7 +83,7 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
with:
toolchain: stable
@@ -138,23 +106,27 @@ jobs:
echo "HOME=/home/ubuntu";
} >> "${GITHUB_ENV}"
- name: Check device is detected
if: ${{ !cancelled() }}
run: nvidia-smi
- name: Run signed integer tests
- name: Run core crypto, integer and internal CUDA backend tests
run: |
BIG_TESTS_INSTANCE=TRUE make test_signed_integer_gpu_ci
make test_gpu
- name: Run signed integer multi-bit tests
- name: Run user docs tests
run: |
BIG_TESTS_INSTANCE=TRUE make test_signed_integer_multi_bit_gpu_ci
make test_user_doc_gpu
- name: Test C API
run: |
make test_c_api_gpu
- name: Run High Level API Tests
run: |
make test_high_level_api_gpu
slack-notify:
name: Slack Notification
needs: [ setup-instance, cuda-tests-linux ]
runs-on: ubuntu-latest
if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' }}
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
if: ${{ !success() && !cancelled() }}
continue-on-error: true
steps:
- name: Send message
@@ -171,7 +143,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

130
.github/workflows/integer_benchmark.yml vendored Normal file
View File

@@ -0,0 +1,130 @@
# Run integer benchmarks on an AWS instance and return parsed results to Slab CI bot.
name: Integer benchmarks
on:
workflow_dispatch:
inputs:
instance_id:
description: "Instance ID"
type: string
instance_image_id:
description: "Instance AMI ID"
type: string
instance_type:
description: "Instance product type"
type: string
runner_name:
description: "Action runner name"
type: string
request_id:
description: "Slab request ID"
type: string
env:
CARGO_TERM_COLOR: always
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
jobs:
run-integer-benchmarks:
name: Execute integer benchmarks in EC2
runs-on: ${{ github.event.inputs.runner_name }}
if: ${{ !cancelled() }}
steps:
- name: Instance configuration used
run: |
echo "IDs: ${{ inputs.instance_id }}"
echo "AMI: ${{ inputs.instance_image_id }}"
echo "Type: ${{ inputs.instance_type }}"
echo "Request ID: ${{ inputs.request_id }}"
- name: Get benchmark date
run: |
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
fetch-depth: 0
- name: Set up home
# "Install rust" step require root user to have a HOME directory which is not set.
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
with:
toolchain: nightly
- name: Run benchmarks with AVX512
run: |
make FAST_BENCH=TRUE bench_integer
- name: Parse benchmarks to csv
run: |
make PARSE_INTEGER_BENCH_CSV_FILE=${{ env.PARSE_INTEGER_BENCH_CSV_FILE }} \
parse_integer_benches
- name: Upload csv results artifact
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
with:
name: ${{ github.sha }}_csv_integer
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
- name: Parse results
run: |
COMMIT_DATE="$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})"
COMMIT_HASH="$(git describe --tags --dirty)"
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
--database tfhe_rs \
--hardware ${{ inputs.instance_type }} \
--project-version "${COMMIT_HASH}" \
--branch ${{ github.ref_name }} \
--commit-date "${COMMIT_DATE}" \
--bench-date "${{ env.BENCH_DATE }}" \
--walk-subdirs \
--name-suffix avx512 \
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
with:
name: ${{ github.sha }}_integer
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
repository: zama-ai/slab
path: slab
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
- name: Send data to Slab
shell: bash
run: |
echo "Computing HMac on results file"
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
echo "Sending results to Slab..."
curl -v -k \
-H "Content-Type: application/json" \
-H "X-Slab-Repository: ${{ github.repository }}" \
-H "X-Slab-Command: store_data_v2" \
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-d @${{ env.RESULTS_FILENAME }} \
${{ secrets.SLAB_URL }}
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ job.status }}
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_MESSAGE: "Integer benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

View File

@@ -1,20 +1,28 @@
# Run all integer benchmarks on an AWS instance and return parsed results to Slab CI bot.
name: Integer benchmarks
name: Integer full benchmarks
on:
workflow_dispatch:
inputs:
all_precisions:
description: "Run all precisions"
type: boolean
default: false
schedule:
# Weekly benchmarks will be triggered each Saturday at 1a.m.
- cron: '0 1 * * 6'
# Quarterly benchmarks will be triggered right before end of quarter, the 25th of the current month at 4a.m.
# These benchmarks are far longer to execute hence the reason to run them only four time a year.
- cron: '0 4 25 MAR,JUN,SEP,DEC *'
instance_id:
description: "Instance ID"
type: string
instance_image_id:
description: "Instance AMI ID"
type: string
instance_type:
description: "Instance product type"
type: string
runner_name:
description: "Action runner name"
type: string
request_id:
description: "Slab request ID"
type: string
user_inputs:
description: "Type of benchmarks to run"
type: string
default: "weekly_benchmarks"
env:
CARGO_TERM_COLOR: always
@@ -22,29 +30,21 @@ env:
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
FAST_BENCH: TRUE
jobs:
prepare-matrix:
name: Prepare operations matrix
runs-on: ubuntu-latest
if: github.event_name != 'schedule' ||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
outputs:
op_flavor: ${{ steps.set_op_flavor.outputs.op_flavor }}
steps:
- name: Weekly benchmarks
if: github.event_name == 'workflow_dispatch' ||
github.event.schedule == '0 1 * * 6'
if: ${{ github.event.inputs.user_inputs == 'weekly_benchmarks' }}
run: |
echo "OP_FLAVOR=[\"default\"]" >> "${GITHUB_ENV}"
- name: Quarterly benchmarks
if: github.event.schedule == '0 4 25 MAR,JUN,SEP,DEC *'
if: ${{ github.event.inputs.user_inputs == 'quarterly_benchmarks' }}
run: |
echo "OP_FLAVOR=[\"default\", \"smart\", \"unchecked\", \"misc\"]" >> "${GITHUB_ENV}"
@@ -53,31 +53,11 @@ jobs:
run: |
echo "op_flavor=${{ toJSON(env.OP_FLAVOR) }}" >> "${GITHUB_OUTPUT}"
setup-instance:
name: Setup instance (integer-benchmarks)
needs: prepare-matrix
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
backend: aws
profile: bench
integer-benchmarks:
name: Execute integer benchmarks for all operations flavor
needs: [ prepare-matrix, setup-instance ]
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
concurrency:
group: ${{ github.workflow }}_${{ github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
needs: prepare-matrix
runs-on: ${{ github.event.inputs.runner_name }}
if: ${{ !cancelled() }}
continue-on-error: true
timeout-minutes: 1440 # 24 hours
strategy:
@@ -86,6 +66,13 @@ jobs:
command: [ integer, integer_multi_bit]
op_flavor: ${{ fromJson(needs.prepare-matrix.outputs.op_flavor) }}
steps:
- name: Instance configuration used
run: |
echo "IDs: ${{ inputs.instance_id }}"
echo "AMI: ${{ inputs.instance_image_id }}"
echo "Type: ${{ inputs.instance_type }}"
echo "Request ID: ${{ inputs.request_id }}"
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
@@ -105,7 +92,7 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
with:
toolchain: nightly
@@ -116,11 +103,6 @@ jobs:
path: slab
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
- name: Should run benchmarks with all precisions
if: inputs.all_precisions
run: |
echo "FAST_BENCH=FALSE" >> "${GITHUB_ENV}"
- name: Run benchmarks with AVX512
run: |
make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_${{ matrix.command }}
@@ -129,7 +111,7 @@ jobs:
run: |
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
--database tfhe_rs \
--hardware "hpc7a.96xlarge" \
--hardware ${{ inputs.instance_type }} \
--project-version "${{ env.COMMIT_HASH }}" \
--branch ${{ github.ref_name }} \
--commit-date "${{ env.COMMIT_DATE }}" \
@@ -139,7 +121,7 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
with:
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
path: ${{ env.RESULTS_FILENAME }}
@@ -158,34 +140,19 @@ jobs:
-d @${{ env.RESULTS_FILENAME }} \
${{ secrets.SLAB_URL }}
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Integer full benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
teardown-instance:
name: Teardown instance (integer-benchmarks)
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
needs: [ setup-instance, integer-benchmarks ]
runs-on: ubuntu-latest
slack-notification:
name: Slack Notification
runs-on: ${{ github.event.inputs.runner_name }}
if: ${{ failure() }}
needs: integer-benchmarks
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
label: ${{ needs.setup-instance.outputs.runner-name }}
- name: Slack Notification
if: ${{ failure() }}
- name: Notify
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Instance teardown (integer-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_MESSAGE: "Integer full benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

View File

@@ -23,14 +23,14 @@ jobs:
setup-instance:
name: Setup instance (cuda-integer-benchmarks)
runs-on: ubuntu-latest
if: github.event_name == 'workflow_dispatch' ||
if: github.event_name != 'push' ||
(github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs')
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -53,7 +53,7 @@ jobs:
gcc: 11
env:
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
CMAKE_VERSION: 3.29.6
CMAKE_VERSION: 3.29.1
steps:
# Mandatory on hyperstack since a bootable volume is not re-usable yet.
- name: Install dependencies
@@ -86,7 +86,7 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
with:
toolchain: nightly
@@ -110,10 +110,6 @@ jobs:
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
} >> "${GITHUB_ENV}"
- name: Check device is detected
if: ${{ !cancelled() }}
run: nvidia-smi
- name: Run benchmarks with AVX512
run: |
make FAST_BENCH=TRUE BENCH_OP_FLAVOR=default bench_integer_gpu
@@ -124,7 +120,7 @@ jobs:
parse_integer_benches
- name: Upload csv results artifact
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
with:
name: ${{ github.sha }}_csv_integer
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
@@ -144,7 +140,7 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
with:
name: ${{ github.sha }}_integer
path: ${{ env.RESULTS_FILENAME }}
@@ -174,7 +170,7 @@ jobs:
name: Slack Notification
needs: [ setup-instance, cuda-integer-benchmarks ]
runs-on: ubuntu-latest
if: ${{ always() && needs.cuda-integer-benchmarks.result != 'skipped' && failure() }}
if: ${{ !success() && !cancelled() }}
continue-on-error: true
steps:
- name: Send message
@@ -191,7 +187,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -29,7 +29,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -57,7 +57,7 @@ jobs:
gcc: 11
env:
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
CMAKE_VERSION: 3.29.6
CMAKE_VERSION: 3.29.1
steps:
# Mandatory on hyperstack since a bootable volume is not re-usable yet.
- name: Install dependencies
@@ -90,7 +90,7 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
with:
toolchain: nightly
@@ -121,10 +121,6 @@ jobs:
path: slab
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
- name: Check device is detected
if: ${{ !cancelled() }}
run: nvidia-smi
- name: Run benchmarks with AVX512
run: |
make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_${{ matrix.command }}_gpu
@@ -144,7 +140,7 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
with:
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
path: ${{ env.RESULTS_FILENAME }}
@@ -184,7 +180,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -0,0 +1,130 @@
# Run integer benchmarks with multi-bit cryptographic parameters on an AWS instance and return parsed results to Slab CI bot.
name: Integer Multi-bit benchmarks
on:
workflow_dispatch:
inputs:
instance_id:
description: "Instance ID"
type: string
instance_image_id:
description: "Instance AMI ID"
type: string
instance_type:
description: "Instance product type"
type: string
runner_name:
description: "Action runner name"
type: string
request_id:
description: "Slab request ID"
type: string
env:
CARGO_TERM_COLOR: always
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
jobs:
run-integer-benchmarks:
name: Execute integer multi-bit benchmarks in EC2
runs-on: ${{ github.event.inputs.runner_name }}
if: ${{ !cancelled() }}
steps:
- name: Instance configuration used
run: |
echo "IDs: ${{ inputs.instance_id }}"
echo "AMI: ${{ inputs.instance_image_id }}"
echo "Type: ${{ inputs.instance_type }}"
echo "Request ID: ${{ inputs.request_id }}"
- name: Get benchmark date
run: |
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
fetch-depth: 0
- name: Set up home
# "Install rust" step require root user to have a HOME directory which is not set.
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
with:
toolchain: nightly
- name: Run multi-bit benchmarks with AVX512
run: |
make FAST_BENCH=TRUE bench_integer_multi_bit
- name: Parse benchmarks to csv
run: |
make PARSE_INTEGER_BENCH_CSV_FILE=${{ env.PARSE_INTEGER_BENCH_CSV_FILE }} \
parse_integer_benches
- name: Upload csv results artifact
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
with:
name: ${{ github.sha }}_csv_integer
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
- name: Parse results
run: |
COMMIT_DATE="$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})"
COMMIT_HASH="$(git describe --tags --dirty)"
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
--database tfhe_rs \
--hardware ${{ inputs.instance_type }} \
--project-version "${COMMIT_HASH}" \
--branch ${{ github.ref_name }} \
--commit-date "${COMMIT_DATE}" \
--bench-date "${{ env.BENCH_DATE }}" \
--walk-subdirs \
--name-suffix avx512 \
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
with:
name: ${{ github.sha }}_integer
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
repository: zama-ai/slab
path: slab
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
- name: Send data to Slab
shell: bash
run: |
echo "Computing HMac on results file"
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
echo "Sending results to Slab..."
curl -v -k \
-H "Content-Type: application/json" \
-H "X-Slab-Repository: ${{ github.repository }}" \
-H "X-Slab-Command: store_data_v2" \
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-d @${{ env.RESULTS_FILENAME }} \
${{ secrets.SLAB_URL }}
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ job.status }}
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_MESSAGE: "Integer benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

View File

@@ -3,16 +3,6 @@ name: Integer GPU Multi-bit benchmarks
on:
workflow_dispatch:
inputs:
all_precisions:
description: "Run all precisions"
type: boolean
default: false
fast_default:
description: "Run only deduplicated default operations without scalar variants"
type: boolean
default: false
schedule:
# Weekly benchmarks will be triggered each Saturday at 1a.m.
- cron: '0 1 * * 6'
@@ -28,8 +18,6 @@ env:
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
FAST_BENCH: TRUE
BENCH_OP_FLAVOR: default
jobs:
setup-instance:
@@ -42,7 +30,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -66,7 +54,7 @@ jobs:
gcc: 11
env:
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
CMAKE_VERSION: 3.29.6
CMAKE_VERSION: 3.29.1
steps:
# Mandatory on hyperstack since a bootable volume is not re-usable yet.
- name: Install dependencies
@@ -99,7 +87,7 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
with:
toolchain: nightly
@@ -123,23 +111,9 @@ jobs:
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
} >> "${GITHUB_ENV}"
- name: Should run benchmarks with all precisions
if: inputs.all_precisions
run: |
echo "FAST_BENCH=FALSE" >> "${GITHUB_ENV}"
- name: Should run fast subset benchmarks
if: inputs.fast_default
run: |
echo "BENCH_OP_FLAVOR=fast_default" >> "${GITHUB_ENV}"
- name: Check device is detected
if: ${{ !cancelled() }}
run: nvidia-smi
- name: Run multi-bit benchmarks with AVX512
run: |
make bench_unsigned_integer_multi_bit_gpu
make FAST_BENCH=TRUE BENCH_OP_FLAVOR=default bench_integer_multi_bit_gpu
- name: Parse benchmarks to csv
run: |
@@ -147,7 +121,7 @@ jobs:
parse_integer_benches
- name: Upload csv results artifact
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
with:
name: ${{ github.sha }}_csv_integer
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
@@ -167,7 +141,7 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
with:
name: ${{ github.sha }}_integer
path: ${{ env.RESULTS_FILENAME }}
@@ -215,7 +189,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -3,16 +3,6 @@ name: Integer multi GPU Multi-bit benchmarks
on:
workflow_dispatch:
inputs:
all_precisions:
description: "Run all precisions"
type: boolean
default: false
fast_default:
description: "Run only deduplicated default operations without scalar variants"
type: boolean
default: false
schedule:
# Weekly benchmarks will be triggered each Saturday at 1a.m.
- cron: '0 1 * * 6'
@@ -27,29 +17,25 @@ env:
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
FAST_BENCH: TRUE
BENCH_OP_FLAVOR: default
jobs:
setup-instance:
name: Setup instance (cuda-integer-multi-bit-multi-gpu-benchmarks)
runs-on: ubuntu-latest
if: ${{ (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs') ||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
github.event_name == 'workflow_dispatch' }}
if: ${{ (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs') || github.event_name == 'workflow_dispatch' }}
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
backend: hyperstack
profile: multi-h100
backend: aws
profile: multi-gpu-test
cuda-integer-multi-bit-multi-gpu-benchmarks:
name: Execute multi GPU integer multi-bit benchmarks
@@ -64,23 +50,11 @@ jobs:
include:
- os: ubuntu-22.04
cuda: "12.2"
gcc: 11
gcc: 9
env:
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
CMAKE_VERSION: 3.29.6
steps:
# Mandatory on hyperstack since a bootable volume is not re-usable yet.
- name: Install dependencies
run: |
sudo apt update
sudo apt install -y checkinstall zlib1g-dev libssl-dev
wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
cd cmake-${{ env.CMAKE_VERSION }}
./bootstrap
make -j"$(nproc)"
sudo make install
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
@@ -100,7 +74,7 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
with:
toolchain: nightly
@@ -131,29 +105,15 @@ jobs:
path: slab
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
- name: Should run benchmarks with all precisions
if: inputs.all_precisions
run: |
echo "FAST_BENCH=FALSE" >> "${GITHUB_ENV}"
- name: Should run fast subset benchmarks
if: inputs.fast_default
run: |
echo "BENCH_OP_FLAVOR=fast_default" >> "${GITHUB_ENV}"
- name: Check device is detected
if: ${{ !cancelled() }}
run: nvidia-smi
- name: Run multi-bit benchmarks with AVX512
run: |
make bench_unsigned_integer_multi_bit_gpu
make FAST_BENCH=TRUE BENCH_OP_FLAVOR=default bench_integer_multi_bit_gpu
- name: Parse results
run: |
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
--database tfhe_rs \
--hardware "n3-H100x8" \
--hardware "p3.8xlarge" \
--backend gpu \
--project-version "${{ env.COMMIT_HASH }}" \
--branch ${{ github.ref_name }} \
@@ -164,7 +124,7 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
with:
name: ${{ github.sha }}_integer
path: ${{ env.RESULTS_FILENAME }}
@@ -204,7 +164,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -29,14 +29,14 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
backend: hyperstack
profile: multi-h100
backend: aws
profile: multi-gpu-test
cuda-integer-full-multi-gpu-benchmarks:
name: Execute multi GPU integer benchmarks for all operations flavor
@@ -54,23 +54,11 @@ jobs:
include:
- os: ubuntu-22.04
cuda: "12.2"
gcc: 11
gcc: 9
env:
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
CMAKE_VERSION: 3.29.6
steps:
# Mandatory on hyperstack since a bootable volume is not re-usable yet.
- name: Install dependencies
run: |
sudo apt update
sudo apt install -y checkinstall zlib1g-dev libssl-dev
wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
cd cmake-${{ env.CMAKE_VERSION }}
./bootstrap
make -j"$(nproc)"
sudo make install
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
@@ -90,7 +78,7 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
with:
toolchain: nightly
@@ -121,10 +109,6 @@ jobs:
path: slab
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
- name: Check device is detected
if: ${{ !cancelled() }}
run: nvidia-smi
- name: Run benchmarks with AVX512
run: |
make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_${{ matrix.command }}_gpu
@@ -133,7 +117,7 @@ jobs:
run: |
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
--database tfhe_rs \
--hardware "n3-H100x8" \
--hardware "p3.8xlarge" \
--backend gpu \
--project-version "${{ env.COMMIT_HASH }}" \
--branch ${{ github.ref_name }} \
@@ -144,7 +128,7 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
with:
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
path: ${{ env.RESULTS_FILENAME }}
@@ -184,7 +168,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -39,7 +39,7 @@ jobs:
persist-credentials: 'false'
- name: Install latest stable
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
with:
toolchain: stable

View File

@@ -30,62 +30,19 @@ env:
NPM_TAG: ""
jobs:
package:
runs-on: ubuntu-latest
outputs:
hash: ${{ steps.hash.outputs.hash }}
steps:
- name: Checkout
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
fetch-depth: 0
- name: Prepare package
run: |
cargo package -p tfhe
- uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0
with:
name: crate
path: target/package/*.crate
- name: generate hash
id: hash
run: cd target/package && echo "hash=$(sha256sum ./*.crate | base64 -w0)" >> "${GITHUB_OUTPUT}"
provenance:
if: ${{ !inputs.dry_run }}
needs: [package]
uses: slsa-framework/slsa-github-generator/.github/workflows/generator_generic_slsa3.yml@v2.0.0
permissions:
# Needed to detect the GitHub Actions environment
actions: read
# Needed to create the provenance via GitHub OIDC
id-token: write
# Needed to upload assets/artifacts
contents: write
with:
# SHA-256 hashes of the Crate package.
base64-subjects: ${{ needs.package.outputs.hash }}
publish_release:
name: Publish Release
needs: [package] # for comparing hashes
runs-on: ubuntu-latest
permissions:
contents: read
id-token: write
steps:
- name: Checkout
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
fetch-depth: 0
- name: Create NPM version tag
if: ${{ inputs.npm_latest_tag }}
run: |
echo "NPM_TAG=latest" >> "${GITHUB_ENV}"
- name: Download artifact
uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8
with:
name: crate
path: target/package
echo "NPM_TAG=$(sed -n -e '1,/^version/p' tfhe/Cargo.toml | grep '^version[[:space:]]*=' | cut -d '=' -f 2 | tr -d ' ')" >> "${GITHUB_ENV}"
- name: Publish crate.io package
if: ${{ inputs.push_to_crates }}
env:
@@ -94,22 +51,6 @@ jobs:
run: |
cargo publish -p tfhe --token ${{ env.CRATES_TOKEN }} ${{ env.DRY_RUN }}
- name: Generate hash
id: published_hash
run: cd target/package && echo "pub_hash=$(sha256sum ./*.crate | base64 -w0)" >> "${GITHUB_OUTPUT}"
- name: Slack notification (hashes comparison)
if: ${{ needs.package.outputs.hash != steps.published_hash.outputs.pub_hash }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: failure
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_MESSAGE: "SLSA tfhe crate - hash comparison failure: (${{ env.ACTION_RUN_URL }})"
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
- name: Build web package
if: ${{ inputs.push_web_package }}
run: |
@@ -123,7 +64,14 @@ jobs:
package: tfhe/pkg/package.json
dry-run: ${{ inputs.dry_run }}
tag: ${{ env.NPM_TAG }}
provenance: true
- name: Publish web package as latest
if: ${{ inputs.push_web_package && inputs.npm_latest_tag }}
uses: JS-DevTools/npm-publish@19c28f1ef146469e409470805ea4279d47c3d35c
with:
token: ${{ secrets.NPM_TOKEN }}
package: tfhe/pkg/package.json
dry-run: ${{ inputs.dry_run }}
- name: Build Node package
if: ${{ inputs.push_node_package }}
@@ -141,7 +89,14 @@ jobs:
package: tfhe/pkg/package.json
dry-run: ${{ inputs.dry_run }}
tag: ${{ env.NPM_TAG }}
provenance: true
- name: Publish Node package as latest
if: ${{ inputs.push_node_package && inputs.npm_latest_tag }}
uses: JS-DevTools/npm-publish@19c28f1ef146469e409470805ea4279d47c3d35c
with:
token: ${{ secrets.NPM_TOKEN }}
package: tfhe/pkg/package.json
dry-run: ${{ inputs.dry_run }}
- name: Slack Notification
if: ${{ failure() }}

View File

@@ -1,3 +1,4 @@
# Publish new release of tfhe-rs on various platform.
name: Publish concrete-csprng release
on:
@@ -36,6 +37,6 @@ jobs:
SLACK_COLOR: ${{ job.status }}
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_MESSAGE: "concrete-csprng release finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
SLACK_MESSAGE: "concrete-csprng release failed: (${{ env.ACTION_RUN_URL }})"
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

View File

@@ -1,36 +0,0 @@
name: Publish tfhe-versionable release
on:
workflow_dispatch:
env:
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
jobs:
publish_release:
name: Publish tfhe-versionable Release
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
fetch-depth: 0
- name: Publish crate.io package
env:
CRATES_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
run: |
cargo publish -p tfhe-versionable-derive --token ${{ env.CRATES_TOKEN }}
cargo publish -p tfhe-versionable --token ${{ env.CRATES_TOKEN }}
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ job.status }}
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_MESSAGE: "tfhe-versionable release finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

View File

@@ -29,7 +29,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -63,7 +63,7 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
with:
toolchain: stable
@@ -112,7 +112,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -14,7 +14,7 @@ on:
jobs:
params-curves-security-check:
runs-on: large_ubuntu_16
runs-on: ubuntu-latest
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
@@ -24,7 +24,7 @@ jobs:
with:
repository: malb/lattice-estimator
path: lattice_estimator
ref: 'e80ec6bbbba212428b0e92d0467c18629cf9ed67'
ref: '53508253629d3b5d31a2ad110e85dc69391ccb95'
- name: Install Sage
run: |

128
.github/workflows/shortint_benchmark.yml vendored Normal file
View File

@@ -0,0 +1,128 @@
# Run shortint benchmarks on an AWS instance and return parsed results to Slab CI bot.
name: Shortint benchmarks
on:
workflow_dispatch:
inputs:
instance_id:
description: "Instance ID"
type: string
instance_image_id:
description: "Instance AMI ID"
type: string
instance_type:
description: "Instance product type"
type: string
runner_name:
description: "Action runner name"
type: string
request_id:
description: "Slab request ID"
type: string
env:
CARGO_TERM_COLOR: always
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
jobs:
run-shortint-benchmarks:
name: Execute shortint benchmarks in EC2
runs-on: ${{ github.event.inputs.runner_name }}
if: ${{ !cancelled() }}
steps:
- name: Instance configuration used
run: |
echo "IDs: ${{ inputs.instance_id }}"
echo "AMI: ${{ inputs.instance_image_id }}"
echo "Type: ${{ inputs.instance_type }}"
echo "Request ID: ${{ inputs.request_id }}"
- name: Get benchmark date
run: |
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
fetch-depth: 0
- name: Set up home
# "Install rust" step require root user to have a HOME directory which is not set.
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
with:
toolchain: nightly
- name: Run benchmarks with AVX512
run: |
make bench_shortint
- name: Parse results
run: |
COMMIT_DATE="$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})"
COMMIT_HASH="$(git describe --tags --dirty)"
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
--database tfhe_rs \
--hardware ${{ inputs.instance_type }} \
--project-version "${COMMIT_HASH}" \
--branch ${{ github.ref_name }} \
--commit-date "${COMMIT_DATE}" \
--bench-date "${{ env.BENCH_DATE }}" \
--walk-subdirs \
--name-suffix avx512 \
--throughput
- name: Measure key sizes
run: |
make measure_shortint_key_sizes
- name: Parse key sizes results
run: |
python3 ./ci/benchmark_parser.py tfhe/shortint_key_sizes.csv ${{ env.RESULTS_FILENAME }} \
--key-sizes \
--append-results
- name: Upload parsed results artifact
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
with:
name: ${{ github.sha }}_shortint
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
repository: zama-ai/slab
path: slab
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
- name: Send data to Slab
shell: bash
run: |
echo "Computing HMac on results file"
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
echo "Sending results to Slab..."
curl -v -k \
-H "Content-Type: application/json" \
-H "X-Slab-Repository: ${{ github.repository }}" \
-H "X-Slab-Command: store_data_v2" \
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-d @${{ env.RESULTS_FILENAME }} \
${{ secrets.SLAB_URL }}
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ job.status }}
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_MESSAGE: "Shortint benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

View File

@@ -3,13 +3,30 @@ name: Shortint full benchmarks
on:
workflow_dispatch:
schedule:
# Weekly benchmarks will be triggered each Saturday at 1a.m.
- cron: '0 1 * * 6'
# Quarterly benchmarks will be triggered right before end of quarter, the 25th of the current month at 4a.m.
# These benchmarks are far longer to execute hence the reason to run them only four time a year.
- cron: '0 4 25 MAR,JUN,SEP,DEC *'
inputs:
instance_id:
description: "Instance ID"
type: string
instance_image_id:
description: "Instance AMI ID"
type: string
instance_type:
description: "Instance product type"
type: string
runner_name:
description: "Action runner name"
type: string
request_id:
description: "Slab request ID"
type: string
# This input is not used in this workflow but still mandatory since a calling workflow could
# use it. If a triggering command include a user_inputs field, then the triggered workflow
# must include this very input, otherwise the workflow won't be called.
# See start_full_benchmarks.yml as example.
user_inputs:
description: "Type of benchmarks to run"
type: string
default: "weekly_benchmarks"
env:
CARGO_TERM_COLOR: always
@@ -17,67 +34,24 @@ env:
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
jobs:
prepare-matrix:
name: Prepare operations matrix
runs-on: ubuntu-latest
if: github.event_name != 'schedule' ||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
outputs:
op_flavor: ${{ steps.set_op_flavor.outputs.op_flavor }}
steps:
- name: Weekly benchmarks
if: github.event_name == 'workflow_dispatch' ||
github.event.schedule == '0 1 * * 6'
run: |
echo "OP_FLAVOR=[\"default\"]" >> "${GITHUB_ENV}"
- name: Quarterly benchmarks
if: github.event.schedule == '0 4 25 MAR,JUN,SEP,DEC *'
run: |
echo "OP_FLAVOR=[\"default\", \"smart\", \"unchecked\"]" >> "${GITHUB_ENV}"
- name: Set operation flavor output
id: set_op_flavor
run: |
echo "op_flavor=${{ toJSON(env.OP_FLAVOR) }}" >> "${GITHUB_OUTPUT}"
setup-instance:
name: Setup instance (shortint-benchmarks)
needs: prepare-matrix
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
backend: aws
profile: bench
shortint-benchmarks:
name: Execute shortint benchmarks for all operations flavor
needs: [ prepare-matrix, setup-instance ]
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
concurrency:
group: ${{ github.workflow }}_${{ github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
continue-on-error: true
runs-on: ${{ github.event.inputs.runner_name }}
if: ${{ !cancelled() }}
strategy:
max-parallel: 1
matrix:
op_flavor: ${{ fromJson(needs.prepare-matrix.outputs.op_flavor) }}
op_flavor: [ default, smart, unchecked ]
steps:
- name: Instance configuration used
run: |
echo "IDs: ${{ inputs.instance_id }}"
echo "AMI: ${{ inputs.instance_image_id }}"
echo "Type: ${{ inputs.instance_type }}"
echo "Request ID: ${{ inputs.request_id }}"
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
@@ -97,7 +71,7 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
with:
toolchain: nightly
@@ -118,7 +92,7 @@ jobs:
COMMIT_HASH="$(git describe --tags --dirty)"
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
--database tfhe_rs \
--hardware "hpc7a.96xlarge" \
--hardware ${{ inputs.instance_type }} \
--project-version "${COMMIT_HASH}" \
--branch ${{ github.ref_name }} \
--commit-date "${COMMIT_DATE}" \
@@ -141,7 +115,7 @@ jobs:
--append-results
- name: Upload parsed results artifact
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
with:
name: ${{ github.sha }}_shortint_${{ matrix.op_flavor }}
path: ${{ env.RESULTS_FILENAME }}
@@ -160,34 +134,19 @@ jobs:
-d @${{ env.RESULTS_FILENAME }} \
${{ secrets.SLAB_URL }}
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Shortint full benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
teardown-instance:
name: Teardown instance (shortint-benchmarks)
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
needs: [ setup-instance, shortint-benchmarks ]
runs-on: ubuntu-latest
slack-notification:
name: Slack Notification
runs-on: ${{ github.event.inputs.runner_name }}
if: ${{ failure() }}
needs: shortint-benchmarks
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
label: ${{ needs.setup-instance.outputs.runner-name }}
- name: Slack Notification
if: ${{ failure() }}
- name: Notify
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Instance teardown (shortint-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_MESSAGE: "Shortint full benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

View File

@@ -0,0 +1,130 @@
# Run signed integer benchmarks on an AWS instance and return parsed results to Slab CI bot.
name: Signed Integer benchmarks
on:
workflow_dispatch:
inputs:
instance_id:
description: "Instance ID"
type: string
instance_image_id:
description: "Instance AMI ID"
type: string
instance_type:
description: "Instance product type"
type: string
runner_name:
description: "Action runner name"
type: string
request_id:
description: "Slab request ID"
type: string
env:
CARGO_TERM_COLOR: always
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
jobs:
run-integer-benchmarks:
name: Execute signed integer benchmarks in EC2
runs-on: ${{ github.event.inputs.runner_name }}
if: ${{ !cancelled() }}
steps:
- name: Instance configuration used
run: |
echo "IDs: ${{ inputs.instance_id }}"
echo "AMI: ${{ inputs.instance_image_id }}"
echo "Type: ${{ inputs.instance_type }}"
echo "Request ID: ${{ inputs.request_id }}"
- name: Get benchmark date
run: |
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
fetch-depth: 0
- name: Set up home
# "Install rust" step require root user to have a HOME directory which is not set.
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
with:
toolchain: nightly
- name: Run benchmarks with AVX512
run: |
make FAST_BENCH=TRUE bench_signed_integer
- name: Parse benchmarks to csv
run: |
make PARSE_INTEGER_BENCH_CSV_FILE=${{ env.PARSE_INTEGER_BENCH_CSV_FILE }} \
parse_integer_benches
- name: Upload csv results artifact
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
with:
name: ${{ github.sha }}_csv_integer
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
- name: Parse results
run: |
COMMIT_DATE="$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})"
COMMIT_HASH="$(git describe --tags --dirty)"
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
--database tfhe_rs \
--hardware ${{ inputs.instance_type }} \
--project-version "${COMMIT_HASH}" \
--branch ${{ github.ref_name }} \
--commit-date "${COMMIT_DATE}" \
--bench-date "${{ env.BENCH_DATE }}" \
--walk-subdirs \
--name-suffix avx512 \
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
with:
name: ${{ github.sha }}_integer
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
repository: zama-ai/slab
path: slab
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
- name: Send data to Slab
shell: bash
run: |
echo "Computing HMac on results file"
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
echo "Sending results to Slab..."
curl -v -k \
-H "Content-Type: application/json" \
-H "X-Slab-Repository: ${{ github.repository }}" \
-H "X-Slab-Command: store_data_v2" \
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-d @${{ env.RESULTS_FILENAME }} \
${{ secrets.SLAB_URL }}
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ job.status }}
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_MESSAGE: "Signed integer benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

View File

@@ -0,0 +1,136 @@
# Run all signed integer benchmarks on an AWS instance and return parsed results to Slab CI bot.
name: Signed Integer full benchmarks
on:
workflow_dispatch:
inputs:
instance_id:
description: "Instance ID"
type: string
instance_image_id:
description: "Instance AMI ID"
type: string
instance_type:
description: "Instance product type"
type: string
runner_name:
description: "Action runner name"
type: string
request_id:
description: "Slab request ID"
type: string
user_inputs:
description: "Type of benchmarks to run"
type: string
default: "weekly_benchmarks"
env:
CARGO_TERM_COLOR: always
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
jobs:
integer-benchmarks:
name: Execute signed integer benchmarks for all operations flavor
runs-on: ${{ github.event.inputs.runner_name }}
if: ${{ !cancelled() }}
continue-on-error: true
timeout-minutes: 1440 # 24 hours
strategy:
max-parallel: 1
matrix:
command: [ integer, integer_multi_bit ]
op_flavor: [ default, unchecked ]
steps:
- name: Instance configuration used
run: |
echo "IDs: ${{ inputs.instance_id }}"
echo "AMI: ${{ inputs.instance_image_id }}"
echo "Type: ${{ inputs.instance_type }}"
echo "Request ID: ${{ inputs.request_id }}"
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
fetch-depth: 0
- name: Get benchmark details
run: |
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
- name: Set up home
# "Install rust" step require root user to have a HOME directory which is not set.
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
with:
toolchain: nightly
- name: Checkout Slab repo
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
repository: zama-ai/slab
path: slab
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
- name: Run benchmarks with AVX512
run: |
make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_signed_${{ matrix.command }}
- name: Parse results
run: |
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
--database tfhe_rs \
--hardware ${{ inputs.instance_type }} \
--project-version "${{ env.COMMIT_HASH }}" \
--branch ${{ github.ref_name }} \
--commit-date "${{ env.COMMIT_DATE }}" \
--bench-date "${{ env.BENCH_DATE }}" \
--walk-subdirs \
--name-suffix avx512 \
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
with:
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
path: ${{ env.RESULTS_FILENAME }}
- name: Send data to Slab
shell: bash
run: |
echo "Computing HMac on results file"
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
echo "Sending results to Slab..."
curl -v -k \
-H "Content-Type: application/json" \
-H "X-Slab-Repository: ${{ github.repository }}" \
-H "X-Slab-Command: store_data_v2" \
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-d @${{ env.RESULTS_FILENAME }} \
${{ secrets.SLAB_URL }}
slack-notification:
name: Slack Notification
runs-on: ${{ github.event.inputs.runner_name }}
if: ${{ failure() }}
needs: integer-benchmarks
steps:
- name: Notify
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ job.status }}
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_MESSAGE: "Signed integer full benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

View File

@@ -0,0 +1,130 @@
# Run signed integer benchmarks with multi-bit cryptographic parameters on an AWS instance and return parsed results to Slab CI bot.
name: Signed Integer Multi-bit benchmarks
on:
workflow_dispatch:
inputs:
instance_id:
description: "Instance ID"
type: string
instance_image_id:
description: "Instance AMI ID"
type: string
instance_type:
description: "Instance product type"
type: string
runner_name:
description: "Action runner name"
type: string
request_id:
description: "Slab request ID"
type: string
env:
CARGO_TERM_COLOR: always
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
jobs:
run-integer-benchmarks:
name: Execute signed integer multi-bit benchmarks in EC2
runs-on: ${{ github.event.inputs.runner_name }}
if: ${{ !cancelled() }}
steps:
- name: Instance configuration used
run: |
echo "IDs: ${{ inputs.instance_id }}"
echo "AMI: ${{ inputs.instance_image_id }}"
echo "Type: ${{ inputs.instance_type }}"
echo "Request ID: ${{ inputs.request_id }}"
- name: Get benchmark date
run: |
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
fetch-depth: 0
- name: Set up home
# "Install rust" step require root user to have a HOME directory which is not set.
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
with:
toolchain: nightly
- name: Run multi-bit benchmarks with AVX512
run: |
make FAST_BENCH=TRUE bench_signed_integer_multi_bit
- name: Parse benchmarks to csv
run: |
make PARSE_INTEGER_BENCH_CSV_FILE=${{ env.PARSE_INTEGER_BENCH_CSV_FILE }} \
parse_integer_benches
- name: Upload csv results artifact
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
with:
name: ${{ github.sha }}_csv_integer
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
- name: Parse results
run: |
COMMIT_DATE="$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})"
COMMIT_HASH="$(git describe --tags --dirty)"
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
--database tfhe_rs \
--hardware ${{ inputs.instance_type }} \
--project-version "${COMMIT_HASH}" \
--branch ${{ github.ref_name }} \
--commit-date "${COMMIT_DATE}" \
--bench-date "${{ env.BENCH_DATE }}" \
--walk-subdirs \
--name-suffix avx512 \
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
with:
name: ${{ github.sha }}_integer
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
repository: zama-ai/slab
path: slab
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
- name: Send data to Slab
shell: bash
run: |
echo "Computing HMac on results file"
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
echo "Sending results to Slab..."
curl -v -k \
-H "Content-Type: application/json" \
-H "X-Slab-Repository: ${{ github.repository }}" \
-H "X-Slab-Command: store_data_v2" \
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-d @${{ env.RESULTS_FILENAME }} \
${{ secrets.SLAB_URL }}
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ job.status }}
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_MESSAGE: "Signed integer benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

123
.github/workflows/start_benchmarks.yml vendored Normal file
View File

@@ -0,0 +1,123 @@
# Start all benchmark jobs on Slab CI bot.
name: Start all benchmarks
on:
push:
branches:
- "main"
workflow_dispatch:
inputs:
# The input name must be the name of the slab command to launch
boolean_bench:
description: "Run Boolean benches"
type: boolean
default: true
shortint_bench:
description: "Run shortint benches"
type: boolean
default: true
integer_bench:
description: "Run integer benches"
type: boolean
default: true
signed_integer_bench:
description: "Run signed integer benches"
type: boolean
default: true
integer_multi_bit_bench:
description: "Run integer multi bit benches"
type: boolean
default: true
signed_integer_multi_bit_bench:
description: "Run signed integer multi bit benches"
type: boolean
default: true
core_crypto_bench:
description: "Run core crypto benches"
type: boolean
default: true
jobs:
start-benchmarks:
if: ${{ (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs') || github.event_name == 'workflow_dispatch' }}
strategy:
matrix:
command: [ boolean_bench, shortint_bench,
integer_bench, integer_multi_bit_bench,
signed_integer_bench, signed_integer_multi_bit_bench,
core_crypto_bench ]
runs-on: ubuntu-latest
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
fetch-depth: 0
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@d6babd6899969df1a11d14c368283ea4436bca78
with:
files_yaml: |
common_benches:
- toolchain.txt
- Makefile
- ci/slab.toml
- tfhe/Cargo.toml
- tfhe/src/core_crypto/**
- .github/workflows/start_benchmarks.yml
boolean_bench:
- tfhe/src/boolean/**
- tfhe/benches/boolean/**
- .github/workflows/boolean_benchmark.yml
shortint_bench:
- tfhe/src/shortint/**
- tfhe/benches/shortint/**
- .github/workflows/shortint_benchmark.yml
integer_bench:
- tfhe/src/shortint/**
- tfhe/src/integer/**
- tfhe/benches/integer/bench.rs
- .github/workflows/integer_benchmark.yml
integer_multi_bit_bench:
- tfhe/src/shortint/**
- tfhe/src/integer/**
- tfhe/benches/integer/bench.rs
- .github/workflows/integer_multi_bit_benchmark.yml
signed_integer_bench:
- tfhe/src/shortint/**
- tfhe/src/integer/**
- tfhe/benches/integer/signed_bench.rs
- .github/workflows/signed_integer_benchmark.yml
signed_integer_multi_bit_bench:
- tfhe/src/shortint/**
- tfhe/src/integer/**
- tfhe/benches/integer/signed_bench.rs
- .github/workflows/signed_integer_multi_bit_benchmark.yml
core_crypto_bench:
- tfhe/src/core_crypto/**
- tfhe/benches/core_crypto/**
- .github/workflows/core_crypto_benchmark.yml
- name: Checkout Slab repo
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
repository: zama-ai/slab
path: slab
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
- name: Start AWS job in Slab
# If manually triggered check that the current bench has been requested
# Otherwise if it's on push check that files relevant to benchmarks have changed
if: (github.event_name == 'workflow_dispatch' && github.event.inputs[matrix.command] == 'true') || (github.event_name == 'push' && (steps.changed-files.outputs.common_benches_any_changed == 'true' || steps.changed-files.outputs[format('{0}_any_changed', matrix.command)] == 'true'))
shell: bash
run: |
echo -n '{"command": "${{ matrix.command }}", "git_ref": "${{ github.ref }}", "sha": "${{ github.sha }}"}' > command.json
SIGNATURE="$(slab/scripts/hmac_calculator.sh command.json '${{ secrets.JOB_SECRET }}')"
curl -v -k \
--fail-with-body \
-H "Content-Type: application/json" \
-H "X-Slab-Repository: ${{ github.repository }}" \
-H "X-Slab-Command: start_aws" \
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-d @command.json \
${{ secrets.SLAB_URL }}

View File

@@ -0,0 +1,66 @@
# Start all benchmark jobs, including full shortint and integer, on Slab CI bot.
name: Start full suite benchmarks
on:
schedule:
# Weekly benchmarks will be triggered each Saturday at 1a.m.
- cron: '0 1 * * 6'
# Quarterly benchmarks will be triggered right before end of quarter, the 25th of the current month at 4a.m.
# These benchmarks are far longer to execute hence the reason to run them only four time a year.
- cron: '0 4 25 MAR,JUN,SEP,DEC *'
workflow_dispatch:
inputs:
benchmark_type:
description: 'Benchmark type'
required: true
default: 'weekly'
type: choice
options:
- weekly
- quarterly
jobs:
start-benchmarks:
if: ${{ (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') || github.event_name == 'workflow_dispatch' }}
strategy:
matrix:
command: [ boolean_bench, shortint_full_bench,
integer_full_bench, signed_integer_full_bench,
core_crypto_bench, wasm_client_bench ]
runs-on: ubuntu-latest
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
fetch-depth: 0
- name: Checkout Slab repo
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
repository: zama-ai/slab
path: slab
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
- name: Set benchmarks type as weekly
if: (github.event_name == 'workflow_dispatch' && inputs.benchmark_type == 'weekly') || github.event.schedule == '0 1 * * 6'
run: |
echo "BENCH_TYPE=weekly_benchmarks" >> "${GITHUB_ENV}"
- name: Set benchmarks type as quarterly
if: (github.event_name == 'workflow_dispatch' && inputs.benchmark_type == 'quarterly') || github.event.schedule == '0 4 25 MAR,JUN,SEP,DEC *'
run: |
echo "BENCH_TYPE=quarterly_benchmarks" >> "${GITHUB_ENV}"
- name: Start AWS job in Slab
shell: bash
run: |
echo -n '{"command": "${{ matrix.command }}", "git_ref": "${{ github.ref }}", "sha": "${{ github.sha }}", "user_inputs": "${{ env.BENCH_TYPE }}"}' > command.json
SIGNATURE="$(slab/scripts/hmac_calculator.sh command.json '${{ secrets.JOB_SECRET }}')"
curl -v -k \
--fail-with-body \
-H "Content-Type: application/json" \
-H "X-Slab-Repository: ${{ github.repository }}" \
-H "X-Slab-Command: start_aws" \
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-d @command.json \
${{ secrets.SLAB_URL }}

View File

@@ -25,8 +25,7 @@ jobs:
should-run:
runs-on: ubuntu-latest
if: github.event_name == 'workflow_dispatch' ||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
(github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs')
((github.event_name == 'push' || github.event_name == 'schedule') && github.repository == 'zama-ai/tfhe-rs')
permissions:
pull-requests: write
outputs:
@@ -39,7 +38,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@40853de9f8ce2d6cfdc73c1b96f14e22ba44aec4
uses: tj-actions/changed-files@d6babd6899969df1a11d14c368283ea4436bca78
with:
since_last_remote_commit: true
files_yaml: |
@@ -54,8 +53,7 @@ jobs:
setup-instance:
name: Setup instance (wasm-client-benchmarks)
if: github.event_name == 'workflow_dispatch' ||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
if: github.event_name != 'push' ||
(github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs' && needs.should-run.outputs.wasm_bench)
needs: should-run
runs-on: ubuntu-latest
@@ -64,7 +62,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -75,8 +73,9 @@ jobs:
wasm-client-benchmarks:
name: Execute WASM client benchmarks
needs: setup-instance
if: needs.setup-instance.result != 'skipped'
needs: [ should-run, setup-instance ]
if: github.event_name != 'push' ||
(github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs' && needs.should-run.outputs.wasm_bench)
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs repo with tags
@@ -98,14 +97,14 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
with:
toolchain: nightly
- name: Run benchmarks
run: |
make install_node
make bench_web_js_api_parallel_ci
make ci_bench_web_js_api_parallel
- name: Parse results
run: |
@@ -130,7 +129,7 @@ jobs:
--append-results
- name: Upload parsed results artifact
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
with:
name: ${{ github.sha }}_wasm
path: ${{ env.RESULTS_FILENAME }}
@@ -172,7 +171,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

7
.gitignore vendored
View File

@@ -7,7 +7,6 @@ target/
# In case of symlinked keys
/keys
**/*.rmeta
**/Cargo.lock
**/*.bin
@@ -23,9 +22,3 @@ dieharder_run.log
# Cuda local build
backends/tfhe-cuda-backend/cuda/cmake-build-debug/
# WASM tests
tfhe/web_wasm_parallel_tests/server.PID
# Dir used for backward compatibility test data
tfhe/tfhe-backward-compat-data/

View File

@@ -7,14 +7,6 @@ members = [
"apps/trivium",
"concrete-csprng",
"backends/tfhe-cuda-backend",
"utils/tfhe-versionable",
"utils/tfhe-versionable-derive",
]
exclude = [
"tfhe/backward_compatibility_tests",
"utils/cargo-tfhe-lints-inner",
"utils/cargo-tfhe-lints"
]
[profile.bench]

220
Makefile
View File

@@ -16,15 +16,19 @@ GEN_KEY_CACHE_COVERAGE_ONLY?=FALSE
PARSE_INTEGER_BENCH_CSV_FILE?=tfhe_rs_integer_benches.csv
FAST_TESTS?=FALSE
FAST_BENCH?=FALSE
NIGHTLY_TESTS?=FALSE
BENCH_OP_FLAVOR?=DEFAULT
NODE_VERSION=22.6
NODE_VERSION=20
FORWARD_COMPAT?=OFF
BACKWARD_COMPAT_DATA_URL=https://github.com/zama-ai/tfhe-backward-compat-data.git
BACKWARD_COMPAT_DATA_BRANCH?=v0.1
BACKWARD_COMPAT_DATA_PROJECT=tfhe-backward-compat-data
BACKWARD_COMPAT_DATA_DIR=$(BACKWARD_COMPAT_DATA_PROJECT)
TFHE_SPEC:=tfhe
# sed: -n, do not print input stream, -e means a script/expression
# 1,/version/ indicates from the first line, to the line matching version at the start of the line
# p indicates to print, so we keep only the start of the Cargo.toml until we hit the first version
# entry which should be the version of tfhe
TFHE_CURRENT_VERSION:=\
$(shell sed -n -e '1,/^version/p' tfhe/Cargo.toml | \
grep '^version[[:space:]]*=' | cut -d '=' -f 2 | xargs)
# Cargo has a hard time distinguishing between our package from the workspace and a package that
# could be a dependency, so we build an unambiguous spec here
TFHE_SPEC:=tfhe@$(TFHE_CURRENT_VERSION)
# This is done to avoid forgetting it, we still precise the RUSTFLAGS in the commands to be able to
# copy paste the command in the terminal and change them if required without forgetting the flags
export RUSTFLAGS?=-C target-cpu=native
@@ -111,7 +115,7 @@ install_cargo_nextest: install_rs_build_toolchain
.PHONY: install_wasm_pack # Install wasm-pack to build JS packages
install_wasm_pack: install_rs_build_toolchain
@wasm-pack --version > /dev/null 2>&1 || \
cargo $(CARGO_RS_BUILD_TOOLCHAIN) install --locked wasm-pack@0.13.0 || \
cargo $(CARGO_RS_BUILD_TOOLCHAIN) install wasm-pack || \
( echo "Unable to install cargo wasm-pack, unknown error." && exit 1 )
.PHONY: install_node # Install last version of NodeJS via nvm
@@ -141,11 +145,6 @@ install_tarpaulin: install_rs_build_toolchain
cargo $(CARGO_RS_BUILD_TOOLCHAIN) install cargo-tarpaulin --locked || \
( echo "Unable to install cargo tarpaulin, unknown error." && exit 1 )
.PHONY: install_tfhe_lints # Install custom tfhe-rs lints
install_tfhe_lints:
(cd utils/cargo-tfhe-lints-inner && cargo install --path .) && \
cd utils/cargo-tfhe-lints && cargo install --path .
.PHONY: check_linelint_installed # Check if linelint newline linter is installed
check_linelint_installed:
@printf "\n" | linelint - > /dev/null 2>&1 || \
@@ -265,17 +264,6 @@ clippy: install_rs_check_toolchain
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer \
-p $(TFHE_SPEC) -- --no-deps -D warnings
.PHONY: clippy_rustdoc # Run clippy lints on doctests enabling the boolean, shortint, integer and zk-pok
clippy_rustdoc: install_rs_check_toolchain
if [[ "$(OS)" != "Linux" && "$(OS)" != "Darwin" ]]; then \
echo "WARNING: skipped clippy_rustdoc, unsupported OS $(OS)"; \
exit 0; \
fi && \
CLIPPYFLAGS="-D warnings" RUSTDOCFLAGS="--no-run --nocapture --test-builder ./scripts/clippy_driver.sh -Z unstable-options" \
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" test --doc \
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,zk-pok,pbs-stats \
-p $(TFHE_SPEC)
.PHONY: clippy_c_api # Run clippy lints enabling the boolean, shortint and the C API
clippy_c_api: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
@@ -285,7 +273,7 @@ clippy_c_api: install_rs_check_toolchain
.PHONY: clippy_js_wasm_api # Run clippy lints enabling the boolean, shortint, integer and the js wasm API
clippy_js_wasm_api: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
--features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,high-level-client-js-wasm-api \
--features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api \
-p $(TFHE_SPEC) -- --no-deps -D warnings
.PHONY: clippy_tasks # Run clippy lints on helper tasks crate.
@@ -301,7 +289,7 @@ clippy_trivium: install_rs_check_toolchain
.PHONY: clippy_all_targets # Run clippy lints on all targets (benches, examples, etc.)
clippy_all_targets: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,zk-pok \
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,zk-pok-experimental \
-p $(TFHE_SPEC) -- --no-deps -D warnings
.PHONY: clippy_concrete_csprng # Run clippy lints on concrete-csprng
@@ -316,23 +304,18 @@ clippy_zk_pok: install_rs_check_toolchain
-p tfhe-zk-pok -- --no-deps -D warnings
.PHONY: clippy_all # Run all clippy targets
clippy_all: clippy_rustdoc clippy clippy_boolean clippy_shortint clippy_integer clippy_all_targets \
clippy_c_api clippy_js_wasm_api clippy_tasks clippy_core clippy_concrete_csprng clippy_zk_pok clippy_trivium
clippy_all: clippy clippy_boolean clippy_shortint clippy_integer clippy_all_targets clippy_c_api \
clippy_js_wasm_api clippy_tasks clippy_core clippy_concrete_csprng clippy_zk_pok clippy_trivium
.PHONY: clippy_fast # Run main clippy targets
clippy_fast: clippy_rustdoc clippy clippy_all_targets clippy_c_api clippy_js_wasm_api clippy_tasks \
clippy_core clippy_concrete_csprng
clippy_fast: clippy clippy_all_targets clippy_c_api clippy_js_wasm_api clippy_tasks clippy_core \
clippy_concrete_csprng
.PHONY: clippy_cuda_backend # Run clippy lints on the tfhe-cuda-backend
clippy_cuda_backend: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
-p tfhe-cuda-backend -- --no-deps -D warnings
.PHONY: tfhe_lints # Run custom tfhe-rs lints
tfhe_lints: install_tfhe_lints
cd tfhe && RUSTFLAGS="$(RUSTFLAGS)" cargo tfhe-lints \
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer -- -D warnings
.PHONY: build_core # Build core_crypto without experimental features
build_core: install_rs_build_toolchain install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
@@ -385,21 +368,21 @@ symlink_c_libs_without_fingerprint:
.PHONY: build_c_api # Build the C API for boolean, shortint and integer
build_c_api: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api,zk-pok,$(FORWARD_COMPAT_FEATURE) \
--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api,zk-pok-experimental,$(FORWARD_COMPAT_FEATURE) \
-p $(TFHE_SPEC)
@"$(MAKE)" symlink_c_libs_without_fingerprint
.PHONY: build_c_api_gpu # Build the C API for boolean, shortint and integer
build_c_api_gpu: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api,zk-pok,gpu \
--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api,zk-pok-experimental,gpu \
-p $(TFHE_SPEC)
@"$(MAKE)" symlink_c_libs_without_fingerprint
.PHONY: build_c_api_experimental_deterministic_fft # Build the C API for boolean, shortint and integer with experimental deterministic FFT
build_c_api_experimental_deterministic_fft: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api,zk-pok,experimental-force_fft_algo_dif4,$(FORWARD_COMPAT_FEATURE) \
--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api,zk-pok-experimental,experimental-force_fft_algo_dif4,$(FORWARD_COMPAT_FEATURE) \
-p $(TFHE_SPEC)
@"$(MAKE)" symlink_c_libs_without_fingerprint
@@ -408,7 +391,7 @@ build_web_js_api: install_rs_build_toolchain install_wasm_pack
cd tfhe && \
RUSTFLAGS="$(WASM_RUSTFLAGS)" rustup run "$(RS_BUILD_TOOLCHAIN)" \
wasm-pack build --release --target=web \
-- --features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,zk-pok
-- --features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,zk-pok-experimental
.PHONY: build_web_js_api_parallel # Build the js API targeting the web browser with parallelism support
build_web_js_api_parallel: install_rs_check_toolchain install_wasm_pack
@@ -416,16 +399,15 @@ build_web_js_api_parallel: install_rs_check_toolchain install_wasm_pack
rustup component add rust-src --toolchain $(RS_CHECK_TOOLCHAIN) && \
RUSTFLAGS="$(WASM_RUSTFLAGS) -C target-feature=+atomics,+bulk-memory,+mutable-globals" rustup run $(RS_CHECK_TOOLCHAIN) \
wasm-pack build --release --target=web \
-- --features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,parallel-wasm-api,zk-pok \
-Z build-std=panic_abort,std && \
find pkg/snippets -type f -iname workerHelpers.worker.js -exec sed -i "s|from '..\/..\/..\/';|from '..\/..\/..\/tfhe.js';|" {} \;
-- --features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,parallel-wasm-api,zk-pok-experimental \
-Z build-std=panic_abort,std
.PHONY: build_node_js_api # Build the js API targeting nodejs
build_node_js_api: install_rs_build_toolchain install_wasm_pack
cd tfhe && \
RUSTFLAGS="$(WASM_RUSTFLAGS)" rustup run "$(RS_BUILD_TOOLCHAIN)" \
wasm-pack build --release --target=nodejs \
-- --features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,zk-pok
-- --features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,zk-pok-experimental
.PHONY: build_concrete_csprng # Build concrete_csprng
build_concrete_csprng: install_rs_build_toolchain
@@ -435,10 +417,10 @@ build_concrete_csprng: install_rs_build_toolchain
.PHONY: test_core_crypto # Run the tests of the core_crypto module including experimental ones
test_core_crypto: install_rs_build_toolchain install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE),experimental,zk-pok -p $(TFHE_SPEC) -- core_crypto::
--features=$(TARGET_ARCH_FEATURE),experimental,zk-pok-experimental -p $(TFHE_SPEC) -- core_crypto::
@if [[ "$(AVX512_SUPPORT)" == "ON" ]]; then \
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE),experimental,zk-pok,$(AVX512_FEATURE) -p $(TFHE_SPEC) -- core_crypto::; \
--features=$(TARGET_ARCH_FEATURE),experimental,zk-pok-experimental,$(AVX512_FEATURE) -p $(TFHE_SPEC) -- core_crypto::; \
fi
.PHONY: test_core_crypto_cov # Run the tests of the core_crypto module with code coverage
@@ -461,8 +443,8 @@ test_cuda_backend:
mkdir -p "$(TFHECUDA_BUILD)" && \
cd "$(TFHECUDA_BUILD)" && \
cmake .. -DCMAKE_BUILD_TYPE=Release -DTFHE_CUDA_BACKEND_BUILD_TESTS=ON && \
"$(MAKE)" -j "$(CPU_COUNT)" && \
"$(MAKE)" test
make -j "$(CPU_COUNT)" && \
make test
.PHONY: test_gpu # Run the tests of the core_crypto module including experimental on the gpu backend
test_gpu: test_core_crypto_gpu test_integer_gpu test_cuda_backend
@@ -481,60 +463,6 @@ test_integer_gpu: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- integer::gpu::server_key::
.PHONY: test_integer_gpu_ci # Run the tests for integer ci on gpu backend
test_integer_gpu_ci: install_rs_check_toolchain install_cargo_nextest
BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
FAST_TESTS="$(FAST_TESTS)" \
NIGHTLY_TESTS="$(NIGHTLY_TESTS)" \
./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
--cargo-profile "$(CARGO_PROFILE)" --backend "gpu" \
--tfhe-package "$(TFHE_SPEC)"
.PHONY: test_unsigned_integer_gpu_ci # Run the tests for unsigned integer ci on gpu backend
test_unsigned_integer_gpu_ci: install_rs_check_toolchain install_cargo_nextest
BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
FAST_TESTS="$(FAST_TESTS)" \
NIGHTLY_TESTS="$(NIGHTLY_TESTS)" \
./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
--cargo-profile "$(CARGO_PROFILE)" --backend "gpu" \
--unsigned-only --tfhe-package "$(TFHE_SPEC)"
.PHONY: test_signed_integer_gpu_ci # Run the tests for signed integer ci on gpu backend
test_signed_integer_gpu_ci: install_rs_check_toolchain install_cargo_nextest
BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
FAST_TESTS="$(FAST_TESTS)" \
NIGHTLY_TESTS="$(NIGHTLY_TESTS)" \
./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
--cargo-profile "$(CARGO_PROFILE)" --backend "gpu" \
--signed-only --tfhe-package "$(TFHE_SPEC)"
.PHONY: test_integer_multi_bit_gpu_ci # Run the tests for integer ci on gpu backend running only multibit tests
test_integer_multi_bit_gpu_ci: install_rs_check_toolchain install_cargo_nextest
BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
FAST_TESTS="$(FAST_TESTS)" \
NIGHTLY_TESTS="$(NIGHTLY_TESTS)" \
./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
--cargo-profile "$(CARGO_PROFILE)" --multi-bit --backend "gpu" \
--tfhe-package "$(TFHE_SPEC)"
.PHONY: test_unsigned_integer_multi_bit_gpu_ci # Run the tests for unsigned integer ci on gpu backend running only multibit tests
test_unsigned_integer_multi_bit_gpu_ci: install_rs_check_toolchain install_cargo_nextest
BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
FAST_TESTS="$(FAST_TESTS)" \
NIGHTLY_TESTS="$(NIGHTLY_TESTS)" \
./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
--cargo-profile "$(CARGO_PROFILE)" --multi-bit --backend "gpu" \
--unsigned-only --tfhe-package "$(TFHE_SPEC)"
.PHONY: test_signed_integer_multi_bit_gpu_ci # Run the tests for signed integer ci on gpu backend running only multibit tests
test_signed_integer_multi_bit_gpu_ci: install_rs_check_toolchain install_cargo_nextest
BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
FAST_TESTS="$(FAST_TESTS)" \
NIGHTLY_TESTS="$(NIGHTLY_TESTS)" \
./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
--cargo-profile "$(CARGO_PROFILE)" --multi-bit --backend "gpu" \
--signed-only --tfhe-package "$(TFHE_SPEC)"
.PHONY: test_boolean # Run the tests of the boolean module
test_boolean: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
@@ -597,7 +525,6 @@ test_shortint_cov: install_rs_check_toolchain install_tarpaulin
test_integer_ci: install_rs_check_toolchain install_cargo_nextest
BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
FAST_TESTS="$(FAST_TESTS)" \
NIGHTLY_TESTS="$(NIGHTLY_TESTS)" \
./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
--cargo-profile "$(CARGO_PROFILE)" --avx512-support "$(AVX512_SUPPORT)" \
--tfhe-package "$(TFHE_SPEC)"
@@ -606,7 +533,6 @@ test_integer_ci: install_rs_check_toolchain install_cargo_nextest
test_unsigned_integer_ci: install_rs_check_toolchain install_cargo_nextest
BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
FAST_TESTS="$(FAST_TESTS)" \
NIGHTLY_TESTS="$(NIGHTLY_TESTS)" \
./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
--cargo-profile "$(CARGO_PROFILE)" --avx512-support "$(AVX512_SUPPORT)" \
--unsigned-only --tfhe-package "$(TFHE_SPEC)"
@@ -615,7 +541,6 @@ test_unsigned_integer_ci: install_rs_check_toolchain install_cargo_nextest
test_signed_integer_ci: install_rs_check_toolchain install_cargo_nextest
BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
FAST_TESTS="$(FAST_TESTS)" \
NIGHTLY_TESTS="$(NIGHTLY_TESTS)" \
./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
--cargo-profile "$(CARGO_PROFILE)" --avx512-support "$(AVX512_SUPPORT)" \
--signed-only --tfhe-package "$(TFHE_SPEC)"
@@ -624,25 +549,22 @@ test_signed_integer_ci: install_rs_check_toolchain install_cargo_nextest
test_integer_multi_bit_ci: install_rs_check_toolchain install_cargo_nextest
BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
FAST_TESTS="$(FAST_TESTS)" \
NIGHTLY_TESTS="$(NIGHTLY_TESTS)" \
./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
--cargo-profile "$(CARGO_PROFILE)" --multi-bit --avx512-support "$(AVX512_SUPPORT)" \
--tfhe-package "$(TFHE_SPEC)"
.PHONY: test_unsigned_integer_multi_bit_ci # Run the tests for unsigned integer ci running only multibit tests
.PHONY: test_unsigned_integer_multi_bit_ci # Run the tests for nsigned integer ci running only multibit tests
test_unsigned_integer_multi_bit_ci: install_rs_check_toolchain install_cargo_nextest
BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
FAST_TESTS="$(FAST_TESTS)" \
NIGHTLY_TESTS="$(NIGHTLY_TESTS)" \
./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
--cargo-profile "$(CARGO_PROFILE)" --multi-bit --avx512-support "$(AVX512_SUPPORT)" \
--unsigned-only --tfhe-package "$(TFHE_SPEC)"
.PHONY: test_signed_integer_multi_bit_ci # Run the tests for signed integer ci running only multibit tests
.PHONY: test_signed_integer_multi_bit_ci # Run the tests for nsigned integer ci running only multibit tests
test_signed_integer_multi_bit_ci: install_rs_check_toolchain install_cargo_nextest
BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
FAST_TESTS="$(FAST_TESTS)" \
NIGHTLY_TESTS="$(NIGHTLY_TESTS)" \
./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
--cargo-profile "$(CARGO_PROFILE)" --multi-bit --avx512-support "$(AVX512_SUPPORT)" \
--signed-only --tfhe-package "$(TFHE_SPEC)"
@@ -669,7 +591,7 @@ test_integer_cov: install_rs_check_toolchain install_tarpaulin
.PHONY: test_high_level_api # Run all the tests for high_level_api
test_high_level_api: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,zk-pok -p $(TFHE_SPEC) \
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,zk-pok-experimental -p $(TFHE_SPEC) \
-- high_level_api::
test_high_level_api_gpu: install_rs_build_toolchain install_cargo_nextest
@@ -680,14 +602,14 @@ test_high_level_api_gpu: install_rs_build_toolchain install_cargo_nextest
.PHONY: test_user_doc # Run tests from the .md documentation
test_user_doc: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) --doc \
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,pbs-stats,zk-pok \
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,pbs-stats,zk-pok-experimental \
-p $(TFHE_SPEC) \
-- test_user_docs::
.PHONY: test_user_doc_gpu # Run tests for GPU from the .md documentation
test_user_doc_gpu: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) --doc \
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,gpu,zk-pok -p $(TFHE_SPEC) \
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,gpu,zk-pok-experimental -p $(TFHE_SPEC) \
-- test_user_docs::
.PHONY: test_fhe_strings # Run tests for fhe_strings example
@@ -726,38 +648,18 @@ test_concrete_csprng: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE) -p concrete-csprng
.PHONY: test_zk_pok # Run tfhe-zk-pok tests
.PHONY: test_zk_pok # Run tfhe-zk-pok-experimental tests
test_zk_pok: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-p tfhe-zk-pok
.PHONY: test_versionable # Run tests for tfhe-versionable subcrate
test_versionable: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-p tfhe-versionable
# The backward compat data repo holds historical binary data but also rust code to generate and load them.
# Here we use the "patch" functionality of Cargo to make sure the repo used for the data is the same as the one used for the code.
.PHONY: test_backward_compatibility_ci
test_backward_compatibility_ci: install_rs_build_toolchain
TFHE_BACKWARD_COMPAT_DATA_DIR="$(BACKWARD_COMPAT_DATA_DIR)" RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
--config "patch.'$(BACKWARD_COMPAT_DATA_URL)'.$(BACKWARD_COMPAT_DATA_PROJECT).path=\"tfhe/$(BACKWARD_COMPAT_DATA_DIR)\"" \
--features=$(TARGET_ARCH_FEATURE),shortint,integer -p $(TFHE_SPEC) test_backward_compatibility -- --nocapture
.PHONY: test_backward_compatibility # Same as test_backward_compatibility_ci but tries to clone the data repo first if needed
test_backward_compatibility: tfhe/$(BACKWARD_COMPAT_DATA_DIR) test_backward_compatibility_ci
.PHONY: backward_compat_branch # Prints the required backward compatibility branch
backward_compat_branch:
@echo "$(BACKWARD_COMPAT_DATA_BRANCH)"
.PHONY: doc # Build rust doc
doc: install_rs_check_toolchain
@# Even though we are not in docs.rs, this allows to "just" build the doc
DOCS_RS=1 \
RUSTDOCFLAGS="--html-in-header katex-header.html" \
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" doc \
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,gpu,internal-keycache,experimental,zk-pok --no-deps -p $(TFHE_SPEC)
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,gpu,internal-keycache,experimental --no-deps -p $(TFHE_SPEC)
.PHONY: docs # Build rust doc alias for doc
docs: doc
@@ -768,7 +670,7 @@ lint_doc: install_rs_check_toolchain
DOCS_RS=1 \
RUSTDOCFLAGS="--html-in-header katex-header.html -Dwarnings" \
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" doc \
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,gpu,internal-keycache,experimental,zk-pok -p $(TFHE_SPEC) --no-deps
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,gpu,internal-keycache,experimental -p $(TFHE_SPEC) --no-deps
.PHONY: lint_docs # Build rust doc with linting enabled alias for lint_doc
lint_docs: lint_doc
@@ -813,7 +715,7 @@ check_compile_tests_benches_gpu: install_rs_build_toolchain
mkdir -p "$(TFHECUDA_BUILD)" && \
cd "$(TFHECUDA_BUILD)" && \
cmake .. -DCMAKE_BUILD_TYPE=Debug -DTFHE_CUDA_BACKEND_BUILD_TESTS=ON -DTFHE_CUDA_BACKEND_BUILD_BENCHMARKS=ON && \
"$(MAKE)" -j "$(CPU_COUNT)"
make -j "$(CPU_COUNT)"
.PHONY: build_nodejs_test_docker # Build a docker image with tools to run nodejs tests for wasm API
build_nodejs_test_docker:
@@ -833,14 +735,14 @@ test_nodejs_wasm_api_in_docker: build_nodejs_test_docker
.PHONY: test_nodejs_wasm_api # Run tests for the nodejs on wasm API
test_nodejs_wasm_api: build_node_js_api
cd tfhe/js_on_wasm_tests && npm run test
cd tfhe && node --test js_on_wasm_tests
.PHONY: test_web_js_api_parallel # Run tests for the web wasm api
test_web_js_api_parallel: build_web_js_api_parallel
$(MAKE) -C tfhe/web_wasm_parallel_tests test
.PHONY: test_web_js_api_parallel_ci # Run tests for the web wasm api
test_web_js_api_parallel_ci: build_web_js_api_parallel
.PHONY: ci_test_web_js_api_parallel # Run tests for the web wasm api
ci_test_web_js_api_parallel: build_web_js_api_parallel
source ~/.nvm/nvm.sh && \
nvm install $(NODE_VERSION) && \
nvm use $(NODE_VERSION) && \
@@ -907,22 +809,6 @@ bench_integer_multi_bit_gpu: install_rs_check_toolchain
--bench integer-bench \
--features=$(TARGET_ARCH_FEATURE),integer,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
.PHONY: bench_unsigned_integer_multi_bit_gpu # Run benchmarks for unsigned integer on GPU backend using multi-bit parameters
bench_unsigned_integer_multi_bit_gpu: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=MULTI_BIT \
__TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench integer-bench \
--features=$(TARGET_ARCH_FEATURE),integer,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) -- ::unsigned
.PHONY: bench_integer_zk # Run benchmarks for integer encryption with ZK proofs
bench_integer_zk: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench zk-pke-bench \
--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,zk-pok,nightly-avx512 \
-p $(TFHE_SPEC) --
.PHONY: bench_shortint # Run benchmarks for shortint
bench_shortint: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) \
@@ -930,12 +816,16 @@ bench_shortint: install_rs_check_toolchain
--bench shortint-bench \
--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
.PHONY: bench_shortint_oprf # Run benchmarks for shortint
bench_shortint_oprf: install_rs_check_toolchain
.PHONY: bench_oprf # Run benchmarks for shortint
bench_oprf: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench oprf-shortint-bench \
--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
RUSTFLAGS="$(RUSTFLAGS)" \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench oprf-integer-bench \
--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
.PHONY: bench_shortint_multi_bit # Run benchmarks for shortint using multi-bit parameters
bench_shortint_multi_bit: install_rs_check_toolchain
@@ -965,7 +855,7 @@ bench_pbs128: install_rs_check_toolchain
.PHONY: bench_pbs_gpu # Run benchmarks for PBS on GPU backend
bench_pbs_gpu: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_FAST_BENCH=$(FAST_BENCH) cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench pbs-bench \
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
@@ -985,10 +875,10 @@ bench_ks_gpu: install_rs_check_toolchain
bench_web_js_api_parallel: build_web_js_api_parallel
$(MAKE) -C tfhe/web_wasm_parallel_tests bench
.PHONY: bench_web_js_api_parallel_ci # Run benchmarks for the web wasm api
bench_web_js_api_parallel_ci: build_web_js_api_parallel
.PHONY: ci_bench_web_js_api_parallel # Run benchmarks for the web wasm api
ci_bench_web_js_api_parallel: build_web_js_api_parallel
source ~/.nvm/nvm.sh && \
nvm use $(NODE_VERSION) && \
nvm use node && \
$(MAKE) -C tfhe/web_wasm_parallel_tests bench-ci
#
@@ -1045,12 +935,6 @@ write_params_to_file: install_rs_check_toolchain
--example write_params_to_file \
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,internal-keycache
.PHONY: clone_backward_compat_data # Clone the data repo needed for backward compatibility tests
clone_backward_compat_data:
./scripts/clone_backward_compat_data.sh $(BACKWARD_COMPAT_DATA_URL) $(BACKWARD_COMPAT_DATA_BRANCH) tfhe/$(BACKWARD_COMPAT_DATA_DIR)
tfhe/$(BACKWARD_COMPAT_DATA_DIR): clone_backward_compat_data
#
# Real use case examples
#
@@ -1077,7 +961,7 @@ sha256_bool: install_rs_check_toolchain
.PHONY: pcc # pcc stands for pre commit checks (except GPU)
pcc: no_tfhe_typo no_dbg_log check_fmt lint_doc check_md_docs_are_tested check_intra_md_links \
clippy_all tfhe_lints check_compile_tests
clippy_all check_compile_tests
.PHONY: pcc_gpu # pcc stands for pre commit checks for GPU compilation
pcc_gpu: clippy_gpu clippy_cuda_backend check_compile_tests_benches_gpu

View File

@@ -159,7 +159,7 @@ To run this code, use the following command:
> Note that when running code that uses `TFHE-rs`, it is highly recommended
to run in release mode with cargo's `--release` flag to have the best performances possible.
*Find an example with more explanations in [this part of the documentation](https://docs.zama.ai/tfhe-rs/get-started/quick_start)*
*Find an example with more explanations in [this part of the documentation](https://docs.zama.ai/tfhe-rs/getting-started/quick_start)*
<p align="right">
<a href="#about" > ↑ Back to top </a>
@@ -208,7 +208,7 @@ When a new update is published in the Lattice Estimator, we update parameters ac
### Security model
The default parameters for the TFHE-rs library are chosen considering the IND-CPA security model, and are selected with a bootstrapping failure probability fixed at p_error = $2^{-64}$. In particular, it is assumed that the results of decrypted computations are not shared by the secret key owner with any third parties, as such an action can lead to leakage of the secret encryption key. If you are designing an application where decryptions must be shared, you will need to craft custom encryption parameters which are chosen in consideration of the IND-CPA^D security model [1].
The default parameters for the TFHE-rs library are chosen considering the IND-CPA security model, and are selected with a bootstrapping failure probability fixed at p_error = $2^{-40}$. In particular, it is assumed that the results of decrypted computations are not shared by the secret key owner with any third parties, as such an action can lead to leakage of the secret encryption key. If you are designing an application where decryptions must be shared, you will need to craft custom encryption parameters which are chosen in consideration of the IND-CPA^D security model [1].
[1] Li, Baiyu, et al. "Securing approximate homomorphic encryption using differential privacy." Annual International Cryptology Conference. Cham: Springer Nature Switzerland, 2022. https://eprint.iacr.org/2022/816.pdf

View File

@@ -4,8 +4,9 @@ use tfhe::{generate_keys, ConfigBuilder, FheUint64, FheUint8};
use tfhe_trivium::{KreyviumStreamByte, TransCiphering};
pub fn kreyvium_byte_gen(c: &mut Criterion) {
let config = ConfigBuilder::default().build();
let config = ConfigBuilder::default()
.enable_function_evaluation()
.build();
let (client_key, server_key) = generate_keys(config);
let key_string = "0053A6F94C9FF24598EB000000000000".to_string();
@@ -32,8 +33,9 @@ pub fn kreyvium_byte_gen(c: &mut Criterion) {
}
pub fn kreyvium_byte_trans(c: &mut Criterion) {
let config = ConfigBuilder::default().build();
let config = ConfigBuilder::default()
.enable_function_evaluation()
.build();
let (client_key, server_key) = generate_keys(config);
let key_string = "0053A6F94C9FF24598EB000000000000".to_string();
@@ -61,8 +63,9 @@ pub fn kreyvium_byte_trans(c: &mut Criterion) {
}
pub fn kreyvium_byte_warmup(c: &mut Criterion) {
let config = ConfigBuilder::default().build();
let config = ConfigBuilder::default()
.enable_function_evaluation()
.build();
let (client_key, server_key) = generate_keys(config);
let key_string = "0053A6F94C9FF24598EB000000000000".to_string();

View File

@@ -13,7 +13,7 @@ pub fn kreyvium_shortint_warmup(c: &mut Criterion) {
let (client_key, server_key): (ClientKey, ServerKey) = gen_keys(PARAM_MESSAGE_1_CARRY_1_KS_PBS);
let ksk = KeySwitchingKey::new(
(&client_key, Some(&server_key)),
(&client_key, &server_key),
(&underlying_ck, &underlying_sk),
PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS,
);
@@ -63,7 +63,7 @@ pub fn kreyvium_shortint_gen(c: &mut Criterion) {
let (client_key, server_key): (ClientKey, ServerKey) = gen_keys(PARAM_MESSAGE_1_CARRY_1_KS_PBS);
let ksk = KeySwitchingKey::new(
(&client_key, Some(&server_key)),
(&client_key, &server_key),
(&underlying_ck, &underlying_sk),
PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS,
);
@@ -108,7 +108,7 @@ pub fn kreyvium_shortint_trans(c: &mut Criterion) {
let (client_key, server_key): (ClientKey, ServerKey) = gen_keys(PARAM_MESSAGE_1_CARRY_1_KS_PBS);
let ksk = KeySwitchingKey::new(
(&client_key, Some(&server_key)),
(&client_key, &server_key),
(&underlying_ck, &underlying_sk),
PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS,
);

View File

@@ -13,7 +13,7 @@ pub fn trivium_shortint_warmup(c: &mut Criterion) {
let (client_key, server_key): (ClientKey, ServerKey) = gen_keys(PARAM_MESSAGE_1_CARRY_1_KS_PBS);
let ksk = KeySwitchingKey::new(
(&client_key, Some(&server_key)),
(&client_key, &server_key),
(&underlying_ck, &underlying_sk),
PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS,
);
@@ -63,7 +63,7 @@ pub fn trivium_shortint_gen(c: &mut Criterion) {
let (client_key, server_key): (ClientKey, ServerKey) = gen_keys(PARAM_MESSAGE_1_CARRY_1_KS_PBS);
let ksk = KeySwitchingKey::new(
(&client_key, Some(&server_key)),
(&client_key, &server_key),
(&underlying_ck, &underlying_sk),
PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS,
);
@@ -108,7 +108,7 @@ pub fn trivium_shortint_trans(c: &mut Criterion) {
let (client_key, server_key): (ClientKey, ServerKey) = gen_keys(PARAM_MESSAGE_1_CARRY_1_KS_PBS);
let ksk = KeySwitchingKey::new(
(&client_key, Some(&server_key)),
(&client_key, &server_key),
(&underlying_ck, &underlying_sk),
PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS,
);

View File

@@ -148,9 +148,10 @@ where
/// Computes one turn of the stream, updating registers and outputting the new bit.
pub fn next_bool(&mut self) -> T {
if let Some(sk) = &self.fhe_key {
set_server_key(sk.clone());
}
match &self.fhe_key {
Some(sk) => set_server_key(sk.clone()),
None => (),
};
let [o, a, b, c] = self.get_output_and_values(0);
@@ -225,12 +226,18 @@ where
/// Computes 64 turns of the stream, outputting the 64 bits all at once in a
/// Vec (first value is oldest, last is newest)
pub fn next_64(&mut self) -> Vec<T> {
if let Some(sk) = &self.fhe_key {
rayon::broadcast(|_| set_server_key(sk.clone()));
match &self.fhe_key {
Some(sk) => {
rayon::broadcast(|_| set_server_key(sk.clone()));
}
None => (),
}
let mut values = self.get_64_output_and_values();
if self.fhe_key.is_some() {
rayon::broadcast(|_| unset_server_key());
match &self.fhe_key {
Some(_) => {
rayon::broadcast(|_| unset_server_key());
}
None => (),
}
let mut ret = Vec::<T>::with_capacity(64);

View File

@@ -119,7 +119,7 @@ impl KreyviumStreamByte<FheUint8> {
}
// Key and iv are stored in reverse in their shift registers
let mut key = key_bytes.map(|b| b.reverse_bits());
let mut key = key_bytes.map(|b| b.map(|x| (x as u8).reverse_bits() as u64));
let mut iv = iv_bytes.map(|x| FheUint8::encrypt_trivial(x.reverse_bits()));
key.reverse();
iv.reverse();
@@ -237,12 +237,18 @@ where
/// Computes 64 turns of the stream, outputting the 64 bits (in 8 bytes) all at once in a
/// Vec (first value is oldest, last is newest)
pub fn next_64(&mut self) -> Vec<T> {
if let Some(sk) = &self.fhe_key {
rayon::broadcast(|_| set_server_key(sk.clone()));
match &self.fhe_key {
Some(sk) => {
rayon::broadcast(|_| set_server_key(sk.clone()));
}
None => (),
}
let values = self.get_64_output_and_values();
if self.fhe_key.is_some() {
rayon::broadcast(|_| unset_server_key());
match &self.fhe_key {
Some(_) => {
rayon::broadcast(|_| unset_server_key());
}
None => (),
}
let mut bytes = Vec::<T>::with_capacity(8);

View File

@@ -224,7 +224,7 @@ fn kreyvium_test_shortint_long() {
let (client_key, server_key): (ClientKey, ServerKey) = gen_keys(PARAM_MESSAGE_1_CARRY_1_KS_PBS);
let ksk = KeySwitchingKey::new(
(&client_key, Some(&server_key)),
(&client_key, &server_key),
(&underlying_ck, &underlying_sk),
PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS,
);
@@ -299,8 +299,9 @@ fn kreyvium_test_clear_byte() {
#[test]
fn kreyvium_test_byte_long() {
let config = ConfigBuilder::default().build();
let config = ConfigBuilder::default()
.enable_function_evaluation()
.build();
let (client_key, server_key) = generate_keys(config);
let key_string = "0053A6F94C9FF24598EB000000000000".to_string();
@@ -337,8 +338,9 @@ fn kreyvium_test_byte_long() {
#[test]
fn kreyvium_test_fhe_byte_transciphering_long() {
let config = ConfigBuilder::default().build();
let config = ConfigBuilder::default()
.enable_function_evaluation()
.build();
let (client_key, server_key) = generate_keys(config);
let key_string = "0053A6F94C9FF24598EB000000000000".to_string();

View File

@@ -360,7 +360,7 @@ fn trivium_test_shortint_long() {
let (client_key, server_key): (ClientKey, ServerKey) = gen_keys(PARAM_MESSAGE_1_CARRY_1_KS_PBS);
let ksk = KeySwitchingKey::new(
(&client_key, Some(&server_key)),
(&client_key, &server_key),
(&underlying_ck, &underlying_sk),
PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS,
);

View File

@@ -120,9 +120,10 @@ where
/// Computes one turn of the stream, updating registers and outputting the new bit.
pub fn next_bool(&mut self) -> T {
if let Some(sk) = &self.fhe_key {
set_server_key(sk.clone());
}
match &self.fhe_key {
Some(sk) => set_server_key(sk.clone()),
None => (),
};
let [o, a, b, c] = self.get_output_and_values(0);
@@ -195,12 +196,18 @@ where
/// Computes 64 turns of the stream, outputting the 64 bits all at once in a
/// Vec (first value is oldest, last is newest)
pub fn next_64(&mut self) -> Vec<T> {
if let Some(sk) = &self.fhe_key {
rayon::broadcast(|_| set_server_key(sk.clone()));
match &self.fhe_key {
Some(sk) => {
rayon::broadcast(|_| set_server_key(sk.clone()));
}
None => (),
}
let mut values = self.get_64_output_and_values();
if self.fhe_key.is_some() {
rayon::broadcast(|_| unset_server_key());
match &self.fhe_key {
Some(_) => {
rayon::broadcast(|_| unset_server_key());
}
None => (),
}
let mut ret = Vec::<T>::with_capacity(64);

View File

@@ -187,12 +187,18 @@ where
/// Computes 64 turns of the stream, outputting the 64 bits (in 8 bytes) all at once in a
/// Vec (first value is oldest, last is newest)
pub fn next_64(&mut self) -> Vec<T> {
if let Some(sk) = &self.fhe_key {
rayon::broadcast(|_| set_server_key(sk.clone()));
match &self.fhe_key {
Some(sk) => {
rayon::broadcast(|_| set_server_key(sk.clone()));
}
None => (),
}
let values = self.get_64_output_and_values();
if self.fhe_key.is_some() {
rayon::broadcast(|_| unset_server_key());
match &self.fhe_key {
Some(_) => {
rayon::broadcast(|_| unset_server_key());
}
None => (),
}
let mut bytes = Vec::<T>::with_capacity(8);

View File

@@ -1,6 +1,6 @@
[package]
name = "tfhe-cuda-backend"
version = "0.4.0-alpha.0"
version = "0.2.0"
edition = "2021"
authors = ["Zama team"]
license = "BSD-3-Clause-Clear"

View File

@@ -8,18 +8,6 @@ fn main() {
}
}
// This is a workaround to the current nightly toolchain (2024-06-27 which started with
// toolchain 2024-05-05) build issue
// Essentially if cbindgen is running, a wrong argument ends up forwarded to the cuda backend
// "make" command during macro expansions for TFHE-rs C API, crashing make for make < 4.4 and
// thus crashing the build
// On the other hand, this speeds up C API build greatly given we don't have macro expansions
// in the CUDA backend so this skips the second compilation of TFHE-rs for macro inspection by
// cbindgen
if std::env::var("_CBINDGEN_IS_RUNNING").is_ok() {
return;
}
println!("Build tfhe-cuda-backend");
println!("cargo::rerun-if-changed=cuda/include");
println!("cargo::rerun-if-changed=cuda/src");

View File

@@ -1,7 +1,6 @@
#ifndef CUDA_CIPHERTEXT_H
#define CUDA_CIPHERTEXT_H
#include "device.h"
#include <cstdint>
extern "C" {
@@ -15,11 +14,5 @@ void cuda_convert_lwe_ciphertext_vector_to_cpu_64(void *stream,
void *dest, void *src,
uint32_t number_of_cts,
uint32_t lwe_dimension);
void cuda_glwe_sample_extract_64(void *stream, uint32_t gpu_index,
void *lwe_array_out, void *glwe_array_in,
uint32_t *nth_array, uint32_t num_glwes,
uint32_t glwe_dimension,
uint32_t polynomial_size);
};
#endif

View File

@@ -1,156 +0,0 @@
#ifndef CUDA_INTEGER_COMPRESSION_H
#define CUDA_INTEGER_COMPRESSION_H
#include "integer.h"
extern "C" {
void scratch_cuda_integer_compress_radix_ciphertext_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t num_lwes, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, uint32_t lwe_per_glwe, uint32_t storage_log_modulus,
bool allocate_gpu_memory);
void scratch_cuda_integer_decompress_radix_ciphertext_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
uint32_t encryption_glwe_dimension, uint32_t encryption_polynomial_size,
uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
uint32_t lwe_dimension, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t num_lwes, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, uint32_t storage_log_modulus, bool allocate_gpu_memory);
void cuda_integer_compress_radix_ciphertext_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *glwe_array_out, void *lwe_array_in, void **fp_ksk, uint32_t num_nths,
int8_t *mem_ptr);
void cuda_integer_decompress_radix_ciphertext_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, void *glwe_in, void *indexes_array,
uint32_t indexes_array_size, void **bsks, int8_t *mem_ptr);
void cleanup_cuda_integer_compress_radix_ciphertext_64(void **streams,
uint32_t *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void);
void cleanup_cuda_integer_decompress_radix_ciphertext_64(void **streams,
uint32_t *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void);
}
template <typename Torus> struct int_compression {
int_radix_params compression_params;
uint32_t storage_log_modulus;
uint32_t lwe_per_glwe;
uint32_t body_count;
// Compression
int8_t *fp_ks_buffer;
Torus *tmp_lwe;
Torus *tmp_glwe_array_out;
int_compression(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, int_radix_params compression_params,
uint32_t num_radix_blocks, uint32_t lwe_per_glwe,
uint32_t storage_log_modulus, bool allocate_gpu_memory) {
this->compression_params = compression_params;
this->lwe_per_glwe = lwe_per_glwe;
this->storage_log_modulus = storage_log_modulus;
this->body_count = num_radix_blocks;
if (allocate_gpu_memory) {
Torus glwe_accumulator_size = (compression_params.glwe_dimension + 1) *
compression_params.polynomial_size;
tmp_lwe = (Torus *)cuda_malloc_async(
num_radix_blocks * (compression_params.small_lwe_dimension + 1) *
sizeof(Torus),
streams[0], gpu_indexes[0]);
tmp_glwe_array_out = (Torus *)cuda_malloc_async(
glwe_accumulator_size * sizeof(Torus), streams[0], gpu_indexes[0]);
scratch_packing_keyswitch_lwe_list_to_glwe_64(
streams[0], gpu_indexes[0], &fp_ks_buffer,
compression_params.glwe_dimension, compression_params.polynomial_size,
num_radix_blocks, true);
}
}
void release(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count) {
cuda_drop_async(tmp_lwe, streams[0], gpu_indexes[0]);
cuda_drop_async(tmp_glwe_array_out, streams[0], gpu_indexes[0]);
cleanup_packing_keyswitch_lwe_list_to_glwe(streams[0], gpu_indexes[0],
&fp_ks_buffer);
}
};
template <typename Torus> struct int_decompression {
int_radix_params encryption_params;
int_radix_params compression_params;
uint32_t storage_log_modulus;
uint32_t body_count;
Torus *tmp_extracted_glwe;
Torus *tmp_extracted_lwe;
int_radix_lut<Torus> *carry_extract_lut;
int_decompression(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, int_radix_params encryption_params,
int_radix_params compression_params,
uint32_t num_radix_blocks, uint32_t storage_log_modulus,
bool allocate_gpu_memory) {
this->encryption_params = encryption_params;
this->compression_params = compression_params;
this->storage_log_modulus = storage_log_modulus;
this->body_count = num_radix_blocks;
if (allocate_gpu_memory) {
Torus glwe_accumulator_size = (compression_params.glwe_dimension + 1) *
compression_params.polynomial_size;
carry_extract_lut = new int_radix_lut<Torus>(
streams, gpu_indexes, gpu_count, encryption_params, 1,
num_radix_blocks, allocate_gpu_memory);
tmp_extracted_glwe = (Torus *)cuda_malloc_async(
glwe_accumulator_size * sizeof(Torus), streams[0], gpu_indexes[0]);
tmp_extracted_lwe = (Torus *)cuda_malloc_async(
num_radix_blocks *
(compression_params.glwe_dimension *
compression_params.polynomial_size +
1) *
sizeof(Torus),
streams[0], gpu_indexes[0]);
// Decompression
// Carry extract LUT
auto carry_extract_f = [encryption_params](Torus x) -> Torus {
return x / encryption_params.message_modulus;
};
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0],
carry_extract_lut->get_lut(gpu_indexes[0], 0),
encryption_params.glwe_dimension, encryption_params.polynomial_size,
encryption_params.message_modulus, encryption_params.carry_modulus,
carry_extract_f);
carry_extract_lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
}
}
void release(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count) {
cuda_drop_async(tmp_extracted_glwe, streams[0], gpu_indexes[0]);
cuda_drop_async(tmp_extracted_lwe, streams[0], gpu_indexes[0]);
carry_extract_lut->release(streams, gpu_indexes, gpu_count);
delete (carry_extract_lut);
}
};
#endif

View File

@@ -64,8 +64,14 @@ void cuda_drop(void *ptr, uint32_t gpu_index);
void cuda_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index);
int cuda_get_max_shared_memory(uint32_t gpu_index);
void cuda_stream_add_callback(cudaStream_t stream, uint32_t gpu_index,
cudaStreamCallback_t callback, void *user_data);
}
void host_free_on_stream_callback(cudaStream_t stream, cudaError_t status,
void *host_pointer);
template <typename Torus>
void cuda_set_value_async(cudaStream_t stream, uint32_t gpu_index,
Torus *d_array, Torus value, Torus n);

View File

@@ -1,8 +1,6 @@
#ifndef HELPER_MULTI_GPU_H
#define HELPER_MULTI_GPU_H
#include <mutex>
#include <variant>
#include <vector>
extern std::mutex m;
extern bool p2p_enabled;
@@ -11,20 +9,6 @@ extern "C" {
int cuda_setup_multi_gpu();
}
// Define a variant type that can be either a vector or a single pointer
template <typename Torus>
using LweArrayVariant = std::variant<std::vector<Torus *>, Torus *>;
// Macro to define the visitor logic using std::holds_alternative for vectors
#define GET_VARIANT_ELEMENT(variant, index) \
[&] { \
if (std::holds_alternative<std::vector<Torus *>>(variant)) { \
return std::get<std::vector<Torus *>>(variant)[index]; \
} else { \
return std::get<Torus *>(variant); \
} \
}()
int get_active_gpu_count(int num_inputs, int gpu_count);
int get_num_inputs_on_gpu(int total_num_inputs, int gpu_index, int gpu_count);

File diff suppressed because it is too large Load Diff

View File

@@ -9,28 +9,15 @@ void cuda_keyswitch_lwe_ciphertext_vector_32(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
uint32_t base_log, uint32_t level_count, uint32_t num_samples);
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t gpu_offset = 0);
void cuda_keyswitch_lwe_ciphertext_vector_64(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
uint32_t base_log, uint32_t level_count, uint32_t num_samples);
void scratch_packing_keyswitch_lwe_list_to_glwe_64(
void *stream, uint32_t gpu_index, int8_t **fp_ks_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t num_lwes,
bool allocate_gpu_memory);
void cuda_packing_keyswitch_lwe_list_to_glwe_64(
void *stream, uint32_t gpu_index, void *glwe_array_out, void *lwe_array_in,
void *fp_ksk_array, int8_t *fp_ks_buffer, uint32_t input_lwe_dimension,
uint32_t output_glwe_dimension, uint32_t output_polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t num_lwes);
void cleanup_packing_keyswitch_lwe_list_to_glwe(void *stream,
uint32_t gpu_index,
int8_t **fp_ks_buffer);
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t gpu_offset = 0);
}
#endif // CNCRT_KS_H_

View File

@@ -26,12 +26,14 @@ void cuda_convert_lwe_programmable_bootstrap_key_64(
void scratch_cuda_programmable_bootstrap_amortized_32(
void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory);
void scratch_cuda_programmable_bootstrap_amortized_64(
void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory);
void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32(
void *stream, uint32_t gpu_index, void *lwe_array_out,
@@ -39,7 +41,8 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32(
void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples);
uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
uint32_t max_shared_memory);
void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
void *stream, uint32_t gpu_index, void *lwe_array_out,
@@ -47,7 +50,8 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples);
uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
uint32_t max_shared_memory, uint32_t gpu_offset = 0);
void cleanup_cuda_programmable_bootstrap_amortized(void *stream,
uint32_t gpu_index,
@@ -56,12 +60,14 @@ void cleanup_cuda_programmable_bootstrap_amortized(void *stream,
void scratch_cuda_programmable_bootstrap_32(
void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory);
void scratch_cuda_programmable_bootstrap_64(
void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory);
void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
void *stream, uint32_t gpu_index, void *lwe_array_out,
@@ -69,7 +75,8 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples);
uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
uint32_t max_shared_memory, uint32_t gpu_offset = 0);
void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
void *stream, uint32_t gpu_index, void *lwe_array_out,
@@ -77,41 +84,44 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples);
uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
uint32_t max_shared_memory, uint32_t gpu_offset = 0);
void cleanup_cuda_programmable_bootstrap(void *stream, uint32_t gpu_index,
int8_t **pbs_buffer);
uint64_t get_buffer_size_programmable_bootstrap_amortized_64(
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t input_lwe_ciphertext_count);
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory);
uint64_t get_buffer_size_programmable_bootstrap_64(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count);
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory);
}
template <typename Torus>
uint64_t get_buffer_size_full_sm_programmable_bootstrap_step_one(
__host__ __device__ uint64_t
get_buffer_size_full_sm_programmable_bootstrap_step_one(
uint32_t polynomial_size) {
return sizeof(Torus) * polynomial_size + // accumulator_rotated
sizeof(double2) * polynomial_size / 2; // accumulator fft
}
template <typename Torus>
uint64_t get_buffer_size_full_sm_programmable_bootstrap_step_two(
__host__ __device__ uint64_t
get_buffer_size_full_sm_programmable_bootstrap_step_two(
uint32_t polynomial_size) {
return sizeof(Torus) * polynomial_size + // accumulator
sizeof(double2) * polynomial_size / 2; // accumulator fft
}
template <typename Torus>
uint64_t
__host__ __device__ uint64_t
get_buffer_size_partial_sm_programmable_bootstrap(uint32_t polynomial_size) {
return sizeof(double2) * polynomial_size / 2; // accumulator fft
}
template <typename Torus>
uint64_t
__host__ __device__ uint64_t
get_buffer_size_full_sm_programmable_bootstrap_tbc(uint32_t polynomial_size) {
return sizeof(Torus) * polynomial_size + // accumulator_rotated
sizeof(Torus) * polynomial_size + // accumulator
@@ -119,19 +129,21 @@ get_buffer_size_full_sm_programmable_bootstrap_tbc(uint32_t polynomial_size) {
}
template <typename Torus>
uint64_t get_buffer_size_partial_sm_programmable_bootstrap_tbc(
__host__ __device__ uint64_t
get_buffer_size_partial_sm_programmable_bootstrap_tbc(
uint32_t polynomial_size) {
return sizeof(double2) * polynomial_size / 2; // accumulator fft mask & body
}
template <typename Torus>
uint64_t get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap(
__host__ __device__ uint64_t
get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap(
uint32_t polynomial_size) {
return sizeof(double2) * polynomial_size / 2; // tbc
}
template <typename Torus>
uint64_t
__host__ __device__ uint64_t
get_buffer_size_full_sm_programmable_bootstrap_cg(uint32_t polynomial_size) {
return sizeof(Torus) * polynomial_size + // accumulator_rotated
sizeof(Torus) * polynomial_size + // accumulator
@@ -139,14 +151,15 @@ get_buffer_size_full_sm_programmable_bootstrap_cg(uint32_t polynomial_size) {
}
template <typename Torus>
uint64_t
__host__ __device__ uint64_t
get_buffer_size_partial_sm_programmable_bootstrap_cg(uint32_t polynomial_size) {
return sizeof(double2) * polynomial_size / 2; // accumulator fft mask & body
}
template <typename Torus>
bool supports_distributed_shared_memory_on_classic_programmable_bootstrap(
uint32_t polynomial_size);
__host__ bool
supports_distributed_shared_memory_on_classic_programmable_bootstrap(
uint32_t polynomial_size, uint32_t max_shared_memory);
template <typename Torus, PBS_TYPE pbs_type> struct pbs_buffer;
@@ -165,7 +178,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
this->pbs_variant = pbs_variant;
auto max_shared_memory = cuda_get_max_shared_memory(0);
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
if (allocate_gpu_memory) {
switch (pbs_variant) {
@@ -242,7 +255,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
bool supports_dsm =
supports_distributed_shared_memory_on_classic_programmable_bootstrap<
Torus>(polynomial_size);
Torus>(polynomial_size, max_shared_memory);
uint64_t full_sm =
get_buffer_size_full_sm_programmable_bootstrap_tbc<Torus>(
@@ -301,10 +314,10 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
};
template <typename Torus>
uint64_t get_buffer_size_programmable_bootstrap_cg(
__host__ __device__ uint64_t get_buffer_size_programmable_bootstrap_cg(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count) {
int max_shared_memory = cuda_get_max_shared_memory(0);
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {
uint64_t full_sm =
get_buffer_size_full_sm_programmable_bootstrap_cg<Torus>(polynomial_size);
uint64_t partial_sm =
@@ -330,7 +343,8 @@ template <typename Torus>
bool has_support_to_cuda_programmable_bootstrap_cg(uint32_t glwe_dimension,
uint32_t polynomial_size,
uint32_t level_count,
uint32_t num_samples);
uint32_t num_samples,
uint32_t max_shared_memory);
template <typename Torus>
void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
@@ -339,7 +353,8 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples);
uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t gpu_offset = 0);
template <typename Torus>
void cuda_programmable_bootstrap_lwe_ciphertext_vector(
@@ -348,7 +363,8 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples);
uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t gpu_offset = 0);
#if (CUDA_ARCH >= 900)
template <typename Torus>
@@ -358,44 +374,43 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples);
uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t gpu_offset = 0);
template <typename Torus>
template <typename Torus, typename STorus>
void scratch_cuda_programmable_bootstrap_tbc(
void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **pbs_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory);
#endif
template <typename Torus>
template <typename Torus, typename STorus>
void scratch_cuda_programmable_bootstrap_cg(
void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **pbs_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory);
template <typename Torus>
template <typename Torus, typename STorus>
void scratch_cuda_programmable_bootstrap(
void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory);
template <typename Torus>
bool has_support_to_cuda_programmable_bootstrap_tbc(uint32_t num_samples,
uint32_t glwe_dimension,
uint32_t polynomial_size,
uint32_t level_count);
uint32_t level_count,
uint32_t max_shared_memory);
#ifdef __CUDACC__
__device__ inline int get_start_ith_ggsw(int i, uint32_t polynomial_size,
int glwe_dimension,
uint32_t level_count);
template <typename T>
__device__ const T *get_ith_mask_kth_block(const T *ptr, int i, int k,
int level, uint32_t polynomial_size,
int glwe_dimension,
uint32_t level_count);
template <typename T>
__device__ T *get_ith_mask_kth_block(T *ptr, int i, int k, int level,
uint32_t polynomial_size,
@@ -407,8 +422,8 @@ __device__ T *get_ith_body_kth_block(T *ptr, int i, int k, int level,
int glwe_dimension, uint32_t level_count);
template <typename T>
__device__ const T *get_multi_bit_ith_lwe_gth_group_kth_block(
const T *ptr, int g, int i, int k, int level, uint32_t grouping_factor,
__device__ T *get_multi_bit_ith_lwe_gth_group_kth_block(
T *ptr, int g, int i, int k, int level, uint32_t grouping_factor,
uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t level_count);
#endif

View File

@@ -8,7 +8,7 @@ extern "C" {
bool has_support_to_cuda_programmable_bootstrap_cg_multi_bit(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t num_samples);
uint32_t num_samples, uint32_t max_shared_memory);
void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
void *stream, uint32_t gpu_index, void *dest, void *src,
@@ -19,7 +19,8 @@ void scratch_cuda_multi_bit_programmable_bootstrap_64(
void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t grouping_factor,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory, uint32_t chunk_size = 0);
void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
void *stream, uint32_t gpu_index, void *lwe_array_out,
@@ -27,7 +28,9 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
uint32_t level_count, uint32_t num_samples);
uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t gpu_offset,
uint32_t lwe_chunk_size = 0);
void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream,
uint32_t gpu_index,
@@ -35,21 +38,23 @@ void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream,
}
template <typename Torus>
bool supports_distributed_shared_memory_on_multibit_programmable_bootstrap(
uint32_t polynomial_size);
__host__ bool
supports_distributed_shared_memory_on_multibit_programmable_bootstrap(
uint32_t polynomial_size, uint32_t max_shared_memory);
template <typename Torus>
bool has_support_to_cuda_programmable_bootstrap_tbc_multi_bit(
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count);
uint32_t level_count, uint32_t max_shared_memory);
#if CUDA_ARCH >= 900
template <typename Torus>
template <typename Torus, typename STorus>
void scratch_cuda_tbc_multi_bit_programmable_bootstrap(
void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t grouping_factor,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory, uint32_t lwe_chunk_size);
template <typename Torus>
void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
@@ -58,14 +63,25 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples);
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
uint32_t gpu_offset, uint32_t lwe_chunk_size);
#endif
template <typename Torus>
template <typename Torus, typename STorus>
void scratch_cuda_cg_multi_bit_programmable_bootstrap(
void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t grouping_factor,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0);
template <typename Torus, typename STorus>
void scratch_cuda_cg_multi_bit_programmable_bootstrap(
void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **pbs_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0);
template <typename Torus>
void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
@@ -74,14 +90,17 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples);
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
uint32_t gpu_offset, uint32_t lwe_chunk_size = 0);
template <typename Torus>
template <typename Torus, typename STorus>
void scratch_cuda_multi_bit_programmable_bootstrap(
void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t grouping_factor,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0);
template <typename Torus>
void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
@@ -90,34 +109,45 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples);
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
uint32_t gpu_offset, uint32_t lwe_chunk_size = 0);
template <typename Torus>
uint64_t get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle(
__host__ __device__ uint64_t
get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle(
uint32_t polynomial_size);
template <typename Torus>
uint64_t get_buffer_size_full_sm_multibit_programmable_bootstrap_step_one(
__host__ __device__ uint64_t
get_buffer_size_full_sm_multibit_programmable_bootstrap_step_one(
uint32_t polynomial_size);
template <typename Torus>
uint64_t get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two(
__host__ __device__ uint64_t
get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two(
uint32_t polynomial_size);
template <typename Torus>
uint64_t get_buffer_size_partial_sm_multibit_programmable_bootstrap_step_one(
__host__ __device__ uint64_t
get_buffer_size_partial_sm_multibit_programmable_bootstrap_step_one(
uint32_t polynomial_size);
template <typename Torus>
uint64_t get_buffer_size_full_sm_cg_multibit_programmable_bootstrap(
__host__ __device__ uint64_t
get_buffer_size_full_sm_cg_multibit_programmable_bootstrap(
uint32_t polynomial_size);
template <typename Torus>
uint64_t get_buffer_size_partial_sm_cg_multibit_programmable_bootstrap(
__host__ __device__ uint64_t
get_buffer_size_partial_sm_cg_multibit_programmable_bootstrap(
uint32_t polynomial_size);
template <typename Torus>
uint64_t get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap(
__host__ __device__ uint64_t
get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap(
uint32_t polynomial_size);
template <typename Torus>
uint64_t get_buffer_size_partial_sm_tbc_multibit_programmable_bootstrap(
__host__ __device__ uint64_t
get_buffer_size_partial_sm_tbc_multibit_programmable_bootstrap(
uint32_t polynomial_size);
template <typename Torus>
uint64_t get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap(
__host__ __device__ uint64_t
get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap(
uint32_t polynomial_size);
template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
@@ -287,7 +317,8 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
};
template <typename Torus, class params>
uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
uint32_t polynomial_size);
__host__ uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
uint32_t polynomial_size,
uint32_t max_shared_memory);
#endif // CUDA_MULTI_BIT_H

View File

@@ -1,3 +1,17 @@
set(SOURCES
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bit_extraction.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bitwise_ops.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bootstrap.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bootstrap_multibit.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/ciphertext.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/circuit_bootstrap.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/device.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/integer.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/keyswitch.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/linear_algebra.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/shifts.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/vertical_packing.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/helper_multi_gpu.h)
file(GLOB_RECURSE SOURCES "*.cu")
add_library(tfhe_cuda_backend STATIC ${SOURCES})
set_target_properties(tfhe_cuda_backend PROPERTIES CUDA_SEPARABLE_COMPILATION ON CUDA_RESOLVE_DEVICE_SYMBOLS ON)

View File

@@ -1,5 +1,4 @@
#include "ciphertext.cuh"
#include "polynomial/parameters.cuh"
void cuda_convert_lwe_ciphertext_vector_to_gpu_64(void *stream,
uint32_t gpu_index,
@@ -20,58 +19,3 @@ void cuda_convert_lwe_ciphertext_vector_to_cpu_64(void *stream,
static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)dest,
(uint64_t *)src, number_of_cts, lwe_dimension);
}
void cuda_glwe_sample_extract_64(void *stream, uint32_t gpu_index,
void *lwe_array_out, void *glwe_array_in,
uint32_t *nth_array, uint32_t num_glwes,
uint32_t glwe_dimension,
uint32_t polynomial_size) {
switch (polynomial_size) {
case 256:
host_sample_extract<uint64_t, AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_glwes,
glwe_dimension);
break;
case 512:
host_sample_extract<uint64_t, AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_glwes,
glwe_dimension);
break;
case 1024:
host_sample_extract<uint64_t, AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_glwes,
glwe_dimension);
break;
case 2048:
host_sample_extract<uint64_t, AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_glwes,
glwe_dimension);
break;
case 4096:
host_sample_extract<uint64_t, AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_glwes,
glwe_dimension);
break;
case 8192:
host_sample_extract<uint64_t, AmortizedDegree<8192>>(
static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_glwes,
glwe_dimension);
break;
case 16384:
host_sample_extract<uint64_t, AmortizedDegree<16384>>(
static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_glwes,
glwe_dimension);
break;
default:
PANIC("Cuda error: unsupported polynomial size. Supported "
"N's are powers of two in the interval [256..16384].")
}
}

View File

@@ -3,7 +3,6 @@
#include "ciphertext.h"
#include "device.h"
#include "polynomial/functions.cuh"
#include <cstdint>
template <typename T>
@@ -26,39 +25,4 @@ void cuda_convert_lwe_ciphertext_vector_to_cpu(cudaStream_t stream,
cuda_memcpy_async_to_cpu(dest, src, size, stream, gpu_index);
}
template <typename Torus, class params>
__global__ void sample_extract(Torus *lwe_array_out, Torus *glwe_array_in,
uint32_t *nth_array, uint32_t glwe_dimension) {
const int input_id = blockIdx.x;
const int glwe_input_size = (glwe_dimension + 1) * params::degree;
const int lwe_output_size = glwe_dimension * params::degree + 1;
auto lwe_out = lwe_array_out + input_id * lwe_output_size;
// We assume each GLWE will store the first polynomial_size inputs
uint32_t lwe_per_glwe = params::degree;
auto glwe_in = glwe_array_in + (input_id / lwe_per_glwe) * glwe_input_size;
auto nth = nth_array[input_id];
sample_extract_mask<Torus, params>(lwe_out, glwe_in, glwe_dimension, nth);
sample_extract_body<Torus, params>(lwe_out, glwe_in, glwe_dimension, nth);
}
template <typename Torus, class params>
__host__ void host_sample_extract(cudaStream_t stream, uint32_t gpu_index,
Torus *lwe_array_out, Torus *glwe_array_in,
uint32_t *nth_array, uint32_t num_nths,
uint32_t glwe_dimension) {
cudaSetDevice(gpu_index);
dim3 grid(num_nths);
dim3 thds(params::degree / params::opt);
sample_extract<Torus, params><<<grid, thds, 0, stream>>>(
lwe_array_out, glwe_array_in, nth_array, glwe_dimension);
check_cuda_error(cudaGetLastError());
}
#endif

View File

@@ -9,14 +9,16 @@ void cuda_keyswitch_lwe_ciphertext_vector_32(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
host_keyswitch_lwe_ciphertext_vector(
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t gpu_offset) {
cuda_keyswitch_lwe_ciphertext_vector(
static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_output_indexes),
static_cast<uint32_t *>(lwe_array_in),
static_cast<uint32_t *>(lwe_input_indexes), static_cast<uint32_t *>(ksk),
lwe_dimension_in, lwe_dimension_out, base_log, level_count, num_samples);
lwe_dimension_in, lwe_dimension_out, base_log, level_count, num_samples,
gpu_offset);
}
/* Perform keyswitch on a batch of 64 bits input LWE ciphertexts.
@@ -39,44 +41,14 @@ void cuda_keyswitch_lwe_ciphertext_vector_64(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
host_keyswitch_lwe_ciphertext_vector(
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t gpu_offset) {
cuda_keyswitch_lwe_ciphertext_vector(
static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes), static_cast<uint64_t *>(ksk),
lwe_dimension_in, lwe_dimension_out, base_log, level_count, num_samples);
}
void scratch_packing_keyswitch_lwe_list_to_glwe_64(
void *stream, uint32_t gpu_index, int8_t **fp_ks_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t num_lwes,
bool allocate_gpu_memory) {
scratch_packing_keyswitch_lwe_list_to_glwe<uint64_t>(
static_cast<cudaStream_t>(stream), gpu_index, fp_ks_buffer,
glwe_dimension, polynomial_size, num_lwes, allocate_gpu_memory);
}
/* Perform functional packing keyswitch on a batch of 64 bits input LWE
* ciphertexts.
*/
void cuda_packing_keyswitch_lwe_list_to_glwe_64(
void *stream, uint32_t gpu_index, void *glwe_array_out, void *lwe_array_in,
void *fp_ksk_array, int8_t *fp_ks_buffer, uint32_t input_lwe_dimension,
uint32_t output_glwe_dimension, uint32_t output_polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t num_lwes) {
host_packing_keyswitch_lwe_list_to_glwe(
static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint64_t *>(glwe_array_out),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(fp_ksk_array), fp_ks_buffer, input_lwe_dimension,
output_glwe_dimension, output_polynomial_size, base_log, level_count,
num_lwes);
}
void cleanup_packing_keyswitch_lwe_list_to_glwe(void *stream,
uint32_t gpu_index,
int8_t **fp_ks_buffer) {
cuda_drop_async(*fp_ks_buffer, static_cast<cudaStream_t>(stream), gpu_index);
lwe_dimension_in, lwe_dimension_out, base_log, level_count, num_samples,
gpu_offset);
}

View File

@@ -7,7 +7,6 @@
#include "polynomial/functions.cuh"
#include "polynomial/polynomial_math.cuh"
#include "torus.cuh"
#include "utils/helper.cuh"
#include "utils/kernel_dimensions.cuh"
#include <thread>
#include <vector>
@@ -39,25 +38,26 @@ __device__ Torus *get_ith_block(Torus *ksk, int i, int level,
// threads in y are used to paralelize the lwe_dimension_in loop.
// shared memory is used to store intermediate results of the reduction.
template <typename Torus>
__global__ void
keyswitch(Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
const Torus *__restrict__ lwe_array_in,
const Torus *__restrict__ lwe_input_indexes,
const Torus *__restrict__ ksk, uint32_t lwe_dimension_in,
uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count) {
__global__ void keyswitch(Torus *lwe_array_out, Torus *lwe_output_indexes,
Torus *lwe_array_in, Torus *lwe_input_indexes,
Torus *ksk, uint32_t lwe_dimension_in,
uint32_t lwe_dimension_out, uint32_t base_log,
uint32_t level_count, int gpu_offset) {
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
const int shmem_index = threadIdx.x + threadIdx.y * blockDim.x;
extern __shared__ int8_t sharedmem[];
Torus *lwe_acc_out = (Torus *)sharedmem;
auto block_lwe_array_out = get_chunk(
lwe_array_out, lwe_output_indexes[blockIdx.y], lwe_dimension_out + 1);
auto block_lwe_array_out =
get_chunk(lwe_array_out, lwe_output_indexes[blockIdx.y + gpu_offset],
lwe_dimension_out + 1);
if (tid <= lwe_dimension_out) {
Torus local_lwe_out = 0;
auto block_lwe_array_in = get_chunk(
lwe_array_in, lwe_input_indexes[blockIdx.y], lwe_dimension_in + 1);
auto block_lwe_array_in =
get_chunk(lwe_array_in, lwe_input_indexes[blockIdx.y + gpu_offset],
lwe_dimension_in + 1);
if (tid == lwe_dimension_out && threadIdx.y == 0) {
local_lwe_out = block_lwe_array_in[lwe_dimension_in];
@@ -99,11 +99,12 @@ keyswitch(Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
}
template <typename Torus>
__host__ void host_keyswitch_lwe_ciphertext_vector(
__host__ void cuda_keyswitch_lwe_ciphertext_vector(
cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus *lwe_output_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes,
Torus *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t gpu_offset = 0) {
cudaSetDevice(gpu_index);
@@ -119,190 +120,42 @@ __host__ void host_keyswitch_lwe_ciphertext_vector(
keyswitch<Torus><<<grid, threads, shared_mem, stream>>>(
lwe_array_out, lwe_output_indexes, lwe_array_in, lwe_input_indexes, ksk,
lwe_dimension_in, lwe_dimension_out, base_log, level_count);
lwe_dimension_in, lwe_dimension_out, base_log, level_count, gpu_offset);
check_cuda_error(cudaGetLastError());
}
template <typename Torus>
void execute_keyswitch_async(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count,
const LweArrayVariant<Torus> &lwe_array_out,
const LweArrayVariant<Torus> &lwe_output_indexes,
const LweArrayVariant<Torus> &lwe_array_in,
const LweArrayVariant<Torus> &lwe_input_indexes,
Torus **ksks, uint32_t lwe_dimension_in,
uint32_t lwe_dimension_out, uint32_t base_log,
uint32_t level_count, uint32_t num_samples) {
void execute_keyswitch(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array_out,
Torus *lwe_output_indexes, Torus *lwe_array_in,
Torus *lwe_input_indexes, Torus **ksks,
uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
uint32_t base_log, uint32_t level_count,
uint32_t num_samples, bool sync_streams = true) {
/// If the number of radix blocks is lower than the number of GPUs, not all
/// GPUs will be active and there will be 1 input per GPU
for (uint i = 0; i < gpu_count; i++) {
auto active_gpu_count = get_active_gpu_count(num_samples, gpu_count);
int num_samples_on_gpu_0 = get_num_inputs_on_gpu(num_samples, 0, gpu_count);
if (sync_streams)
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
#pragma omp parallel for num_threads(active_gpu_count)
for (uint i = 0; i < active_gpu_count; i++) {
int num_samples_on_gpu = get_num_inputs_on_gpu(num_samples, i, gpu_count);
Torus *current_lwe_array_out = GET_VARIANT_ELEMENT(lwe_array_out, i);
Torus *current_lwe_output_indexes =
GET_VARIANT_ELEMENT(lwe_output_indexes, i);
Torus *current_lwe_array_in = GET_VARIANT_ELEMENT(lwe_array_in, i);
Torus *current_lwe_input_indexes =
GET_VARIANT_ELEMENT(lwe_input_indexes, i);
int gpu_offset = get_gpu_offset(num_samples, i, gpu_count);
// Compute Keyswitch
host_keyswitch_lwe_ciphertext_vector<Torus>(
streams[i], gpu_indexes[i], current_lwe_array_out,
current_lwe_output_indexes, current_lwe_array_in,
current_lwe_input_indexes, ksks[i], lwe_dimension_in, lwe_dimension_out,
base_log, level_count, num_samples_on_gpu);
cuda_keyswitch_lwe_ciphertext_vector<Torus>(
streams[i], gpu_indexes[i], lwe_array_out, lwe_output_indexes,
lwe_array_in, lwe_input_indexes, ksks[i], lwe_dimension_in,
lwe_dimension_out, base_log, level_count, num_samples_on_gpu,
gpu_offset);
}
}
template <typename Torus>
__host__ void scratch_packing_keyswitch_lwe_list_to_glwe(
cudaStream_t stream, uint32_t gpu_index, int8_t **fp_ks_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t num_lwes,
bool allocate_gpu_memory) {
cudaSetDevice(gpu_index);
int glwe_accumulator_size = (glwe_dimension + 1) * polynomial_size;
if (allocate_gpu_memory)
*fp_ks_buffer = (int8_t *)cuda_malloc_async(
2 * num_lwes * glwe_accumulator_size * sizeof(Torus), stream,
gpu_index);
}
// public functional packing keyswitch for a single LWE ciphertext
//
// Assumes there are (glwe_dimension+1) * polynomial_size threads split through
// different thread blocks at the x-axis to work on that input.
template <typename Torus>
__device__ void packing_keyswitch_lwe_ciphertext_into_glwe_ciphertext(
Torus *glwe_out, Torus *lwe_in, Torus *fp_ksk, uint32_t lwe_dimension_in,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count) {
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
size_t glwe_size = (glwe_dimension + 1);
if (tid < glwe_size * polynomial_size) {
const int local_index = threadIdx.x;
// the output_glwe is split in polynomials and each x-block takes one of
// them
size_t poly_id = blockIdx.x;
size_t coef_per_block = blockDim.x;
// number of coefficients inside fp-ksk block for each lwe_input coefficient
size_t ksk_block_size = glwe_size * polynomial_size * level_count;
// initialize accumulator to 0
glwe_out[tid] = SEL(0, lwe_in[lwe_dimension_in],
tid == glwe_dimension * polynomial_size);
// Iterate through all lwe elements
for (int i = 0; i < lwe_dimension_in; i++) {
// Round and prepare decomposition
Torus a_i = round_to_closest_multiple(lwe_in[i], base_log, level_count);
Torus state = a_i >> (sizeof(Torus) * 8 - base_log * level_count);
Torus mod_b_mask = (1ll << base_log) - 1ll;
// block of key for current lwe coefficient (cur_input_lwe[i])
auto ksk_block = &fp_ksk[i * ksk_block_size];
for (int j = 0; j < level_count; j++) {
auto ksk_glwe = &ksk_block[j * glwe_size * polynomial_size];
// Iterate through each level and multiply by the ksk piece
auto ksk_glwe_chunk = &ksk_glwe[poly_id * coef_per_block];
Torus decomposed = decompose_one<Torus>(state, mod_b_mask, base_log);
glwe_out[tid] -= decomposed * ksk_glwe_chunk[local_index];
}
if (sync_streams)
for (uint i = 0; i < active_gpu_count; i++) {
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
}
}
}
// public functional packing keyswitch for a batch of LWE ciphertexts
//
// Selects the input each thread is working on using the y-block index.
//
// Assumes there are (glwe_dimension+1) * polynomial_size threads split through
// different thread blocks at the x-axis to work on that input.
template <typename Torus>
__global__ void
packing_keyswitch_lwe_list_to_glwe(Torus *glwe_array_out, Torus *lwe_array_in,
Torus *fp_ksk, uint32_t lwe_dimension_in,
uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, Torus *d_mem) {
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
const int glwe_accumulator_size = (glwe_dimension + 1) * polynomial_size;
const int lwe_size = (lwe_dimension_in + 1);
const int input_id = blockIdx.y;
const int degree = input_id;
// Select an input
auto lwe_in = lwe_array_in + input_id * lwe_size;
auto ks_glwe_out = d_mem + input_id * glwe_accumulator_size;
auto glwe_out = glwe_array_out + input_id * glwe_accumulator_size;
// KS LWE to GLWE
packing_keyswitch_lwe_ciphertext_into_glwe_ciphertext(
ks_glwe_out, lwe_in, fp_ksk, lwe_dimension_in, glwe_dimension,
polynomial_size, base_log, level_count);
// P * x ^degree
auto in_poly = ks_glwe_out + (tid / polynomial_size) * polynomial_size;
auto out_result = glwe_out + (tid / polynomial_size) * polynomial_size;
polynomial_accumulate_monic_monomial_mul(out_result, in_poly, degree,
tid % polynomial_size,
polynomial_size, 1, true);
}
/// To-do: Rewrite this kernel for efficiency
template <typename Torus>
__global__ void accumulate_glwes(Torus *glwe_out, Torus *glwe_array_in,
uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t num_lwes) {
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < (glwe_dimension + 1) * polynomial_size) {
glwe_out[tid] = glwe_array_in[tid];
// Accumulate
for (int i = 1; i < num_lwes; i++) {
auto glwe_in = glwe_array_in + i * (glwe_dimension + 1) * polynomial_size;
glwe_out[tid] += glwe_in[tid];
}
}
}
template <typename Torus>
__host__ void host_packing_keyswitch_lwe_list_to_glwe(
cudaStream_t stream, uint32_t gpu_index, Torus *glwe_out,
Torus *lwe_array_in, Torus *fp_ksk_array, int8_t *fp_ks_buffer,
uint32_t lwe_dimension_in, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_lwes) {
cudaSetDevice(gpu_index);
int glwe_accumulator_size = (glwe_dimension + 1) * polynomial_size;
int num_blocks = 0, num_threads = 0;
getNumBlocksAndThreads(glwe_accumulator_size, 128, num_blocks, num_threads);
dim3 grid(num_blocks, num_lwes);
dim3 threads(num_threads);
auto d_mem = (Torus *)fp_ks_buffer;
auto d_tmp_glwe_array_out = d_mem + num_lwes * glwe_accumulator_size;
// individually keyswitch each lwe
packing_keyswitch_lwe_list_to_glwe<<<grid, threads, 0, stream>>>(
d_tmp_glwe_array_out, lwe_array_in, fp_ksk_array, lwe_dimension_in,
glwe_dimension, polynomial_size, base_log, level_count, d_mem);
check_cuda_error(cudaGetLastError());
// accumulate to a single glwe
accumulate_glwes<<<num_blocks, threads, 0, stream>>>(
glwe_out, d_tmp_glwe_array_out, glwe_dimension, polynomial_size,
num_lwes);
check_cuda_error(cudaGetLastError());
}
#endif

View File

@@ -2,7 +2,6 @@
#define CNCRT_TORUS_CUH
#include "types/int128.cuh"
#include "utils/kernel_dimensions.cuh"
#include <limits>
template <typename T>
@@ -19,66 +18,57 @@ __device__ inline void typecast_double_to_torus<uint32_t>(double x,
template <>
__device__ inline void typecast_double_to_torus<uint64_t>(double x,
uint64_t &r) {
// The ull intrinsic does not behave in the same way on all architectures and
// on some platforms this causes the cmux tree test to fail
// Hence the intrinsic is not used here
uint128 nnnn = make_uint128_from_float(x);
uint64_t lll = nnnn.lo_;
r = lll;
}
template <typename T>
__device__ inline void typecast_double_round_to_torus(double x, T &r) {
double mx = (sizeof(T) == 4) ? 4294967296.0 : 18446744073709551616.0;
double frac = x - floor(x);
frac *= mx;
typecast_double_to_torus(frac, r);
}
template <typename T>
__device__ inline T round_to_closest_multiple(T x, uint32_t base_log,
uint32_t level_count) {
const T non_rep_bit_count = sizeof(T) * 8 - level_count * base_log;
const T shift = non_rep_bit_count - 1;
T shift = sizeof(T) * 8 - level_count * base_log;
T mask = 1ll << (shift - 1);
T b = (x & mask) >> (shift - 1);
T res = x >> shift;
res += 1;
res &= (T)(-2);
return res << shift;
res += b;
res <<= shift;
return res;
}
template <typename T>
__device__ __forceinline__ void modulus_switch(T input, T &output,
uint32_t log_modulus) {
constexpr uint32_t BITS = sizeof(T) * 8;
output = input + (((T)1) << (BITS - log_modulus - 1));
output >>= (BITS - log_modulus);
__device__ __forceinline__ void rescale_torus_element(T element, T &output,
uint32_t log_shift) {
output =
round((double)element / (double(std::numeric_limits<T>::max()) + 1.0) *
(double)log_shift);
}
template <typename T>
__device__ __forceinline__ T modulus_switch(T input, uint32_t log_modulus) {
T output;
modulus_switch(input, output, log_modulus);
return output;
__device__ __forceinline__ T rescale_torus_element(T element,
uint32_t log_shift) {
return round((double)element / (double(std::numeric_limits<T>::max()) + 1.0) *
(double)log_shift);
}
template <typename Torus>
__global__ void modulus_switch_inplace(Torus *array, int size,
uint32_t log_modulus) {
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < size) {
array[tid] = modulus_switch(array[tid], log_modulus);
}
template <>
__device__ __forceinline__ void
rescale_torus_element<uint32_t>(uint32_t element, uint32_t &output,
uint32_t log_shift) {
output =
round(__uint2double_rn(element) /
(__uint2double_rn(std::numeric_limits<uint32_t>::max()) + 1.0) *
__uint2double_rn(log_shift));
}
template <typename Torus>
__host__ void host_modulus_switch_inplace(cudaStream_t stream,
uint32_t gpu_index, Torus *array,
int size, uint32_t log_modulus) {
cudaSetDevice(gpu_index);
int num_threads = 0, num_blocks = 0;
getNumBlocksAndThreads(size, 1024, num_blocks, num_threads);
modulus_switch_inplace<<<num_blocks, num_threads, 0, stream>>>(array, size,
log_modulus);
check_cuda_error(cudaGetLastError());
template <>
__device__ __forceinline__ void
rescale_torus_element<uint64_t>(uint64_t element, uint64_t &output,
uint32_t log_shift) {
output = round(__ull2double_rn(element) /
(__ull2double_rn(std::numeric_limits<uint64_t>::max()) + 1.0) *
__uint2double_rn(log_shift));
}
#endif // CNCRT_TORUS_H

View File

@@ -166,21 +166,19 @@ __global__ void cuda_set_value_kernel(Torus *array, Torus value, Torus n) {
template <typename Torus>
void cuda_set_value_async(cudaStream_t stream, uint32_t gpu_index,
Torus *d_array, Torus value, Torus n) {
if (n > 0) {
cudaPointerAttributes attr;
check_cuda_error(cudaPointerGetAttributes(&attr, d_array));
if (attr.type != cudaMemoryTypeDevice) {
PANIC("Cuda error: invalid dest device pointer in cuda set value.")
}
check_cuda_error(cudaSetDevice(gpu_index));
int block_size = 256;
int num_blocks = (n + block_size - 1) / block_size;
// Launch the kernel
cuda_set_value_kernel<<<num_blocks, block_size, 0, stream>>>(d_array, value,
n);
check_cuda_error(cudaGetLastError());
cudaPointerAttributes attr;
check_cuda_error(cudaPointerGetAttributes(&attr, d_array));
if (attr.type != cudaMemoryTypeDevice) {
PANIC("Cuda error: invalid dest device pointer in cuda set value.")
}
check_cuda_error(cudaSetDevice(gpu_index));
int block_size = 256;
int num_blocks = (n + block_size - 1) / block_size;
// Launch the kernel
cuda_set_value_kernel<<<num_blocks, block_size, 0, stream>>>(d_array, value,
n);
check_cuda_error(cudaGetLastError());
}
/// Explicitly instantiate cuda_set_value_async for 32 and 64 bits
@@ -243,18 +241,22 @@ void cuda_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index) {
/// Get the maximum size for the shared memory
int cuda_get_max_shared_memory(uint32_t gpu_index) {
check_cuda_error(cudaSetDevice(gpu_index));
int max_shared_memory = 0;
cudaDeviceGetAttribute(&max_shared_memory, cudaDevAttrMaxSharedMemoryPerBlock,
gpu_index);
check_cuda_error(cudaGetLastError());
#if CUDA_ARCH == 900
max_shared_memory = 226000;
#elif CUDA_ARCH == 890
max_shared_memory = 127000;
#elif CUDA_ARCH == 800
max_shared_memory = 163000;
#elif CUDA_ARCH == 700
max_shared_memory = 95000;
#endif
return max_shared_memory;
}
void cuda_stream_add_callback(cudaStream_t stream, uint32_t gpu_index,
cudaStreamCallback_t callback, void *user_data) {
check_cuda_error(cudaSetDevice(gpu_index));
check_cuda_error(cudaStreamAddCallback(stream, callback, user_data, 0));
}
void host_free_on_stream_callback(cudaStream_t stream, cudaError_t status,
void *host_pointer) {
free(host_pointer);
}

View File

@@ -6,360 +6,6 @@
#include "twiddles.cuh"
#include "types/complex/operations.cuh"
using Index = unsigned;
constexpr inline Index warpsize() { return 32; }
template <Index SIZE> __device__ inline void sync_fft_step() {
if constexpr (SIZE < warpsize()) {
__syncwarp();
} else {
__syncthreads();
}
}
__device__ constexpr bool coalesce_mem_access() { return false; }
__device__ inline void direct_butterfly(double2 *a0, double2 *a1, double2 w) {
double2 b0 = *a0;
double2 b1 = *a1 * w;
*a0 = b0 + b1;
*a1 = b0 - b1;
}
__device__ inline void inverse_butterfly(double2 *a0, double2 *a1, double2 w) {
double2 b0 = *a0 + *a1;
double2 b1 = *a0 - *a1;
*a0 = b0;
*a1 = mul_conj(b1, w);
}
template <Index M, Index T>
__device__ inline void NSMFFT_direct_2warpsize(double2 *a0, double2 *a1,
Index const tid) {
if constexpr (T > 0) {
if constexpr (T < warpsize()) {
// thread 0:
// a0: 0 * T
// a1: 2 * T
// thread 1:
// a0: 1 * T
// a1: 3 * T
bool is_lower = ((tid >> tfhe_log2(T)) & 1) == 0;
double2 tmp = is_lower ? *a1 : *a0;
// thread 0:
// tmp: 2 * T
// thread 1:
// tmp: 1 * T
tmp.x = __shfl_xor_sync(0xFFFFFFFF, tmp.x, T, 2 * T);
tmp.y = __shfl_xor_sync(0xFFFFFFFF, tmp.y, T, 2 * T);
// thread 0:
// tmp: 1 * T
// thread 1:
// tmp: 2 * T
*a0 = is_lower ? *a0 : tmp;
*a1 = is_lower ? tmp : *a1;
// thread 0:
// a0: 0 * T
// a1: 1 * T
// thread 1:
// a0: 2 * T
// a1: 3 * T
}
double2 w1 = negtwiddles[M + (tid / T)];
direct_butterfly(a0, a1, w1);
NSMFFT_direct_2warpsize<M * 2, T / 2>(a0, a1, tid);
}
}
template <Index M, Index T>
__device__ inline void NSMFFT_inverse_2warpsize(double2 *a0, double2 *a1,
Index const tid) {
if constexpr (T > 0) {
NSMFFT_inverse_2warpsize<M * 2, T / 2>(a0, a1, tid);
double2 w1 = negtwiddles[M + (tid / T)];
inverse_butterfly(a0, a1, w1);
if constexpr (T < warpsize()) {
// thread 0:
// a0: 0 * T
// a1: 1 * T
// thread 1:
// a0: 2 * T
// a1: 3 * T
bool is_lower = ((tid >> tfhe_log2(T)) & 1) == 0;
double2 tmp = is_lower ? *a1 : *a0;
// thread 0:
// tmp: 1 * T
// thread 1:
// tmp: 2 * T
tmp.x = __shfl_xor_sync(0xFFFFFFFF, tmp.x, T, 2 * T);
tmp.y = __shfl_xor_sync(0xFFFFFFFF, tmp.y, T, 2 * T);
// thread 0:
// tmp: 2 * T
// thread 1:
// tmp: 1 * T
*a0 = is_lower ? *a0 : tmp;
*a1 = is_lower ? tmp : *a1;
// thread 0:
// a0: 0 * T
// a1: 2 * T
// thread 1:
// a0: 1 * T
// a1: 3 * T
}
}
}
template <Index RADIX, Index DEGREE, Index M>
__device__ inline void NSMFFT_direct_step(double2 *A, Index const tid) {
constexpr Index SIZE = DEGREE / M;
constexpr Index T = SIZE / RADIX;
Index i;
double2 w1;
if constexpr (M == 1) {
i = 0;
w1 = (double2){
0.707106781186547461715008466854,
0.707106781186547461715008466854,
};
} else {
i = tid / T;
w1 = negtwiddles[M + i];
}
Index j = i * SIZE + (tid % T);
if constexpr (RADIX == 4) {
double2 w2 = negtwiddles[2 * (M + i) + 0];
double2 w3 = negtwiddles[2 * (M + i) + 1];
Index j0 = j + 0 * T;
Index j1 = j + 1 * T;
Index j2 = j + 2 * T;
Index j3 = j + 3 * T;
double2 a0 = A[j0];
double2 a1 = A[j1];
double2 a2 = A[j2];
double2 a3 = A[j3];
direct_butterfly(&a0, &a2, w1);
direct_butterfly(&a1, &a3, w1);
direct_butterfly(&a0, &a1, w2);
direct_butterfly(&a2, &a3, w3);
A[j0] = a0;
A[j1] = a1;
A[j2] = a2;
A[j3] = a3;
} else if constexpr (RADIX == 2) {
Index j0 = j + 0 * T;
Index j1 = j + 1 * T;
double2 a0 = A[j0];
double2 a1 = A[j1];
direct_butterfly(&a0, &a1, w1);
A[j0] = a0;
A[j1] = a1;
}
}
template <Index RADIX, Index DEGREE, Index M>
__device__ inline void NSMFFT_inverse_step(double2 *A, Index const tid) {
constexpr Index SIZE = DEGREE / M;
constexpr Index T = SIZE / RADIX;
Index i;
double2 w1;
if constexpr (M == 1) {
i = 0;
w1 = (double2){
0.707106781186547461715008466854,
0.707106781186547461715008466854,
};
} else {
i = tid / T;
w1 = negtwiddles[M + i];
}
Index j = i * SIZE + (tid % T);
if constexpr (RADIX == 4) {
double2 w2 = negtwiddles[2 * (M + i) + 0];
double2 w3 = negtwiddles[2 * (M + i) + 1];
Index j0 = j + 0 * T;
Index j1 = j + 1 * T;
Index j2 = j + 2 * T;
Index j3 = j + 3 * T;
double2 a0 = A[j0];
double2 a1 = A[j1];
double2 a2 = A[j2];
double2 a3 = A[j3];
if constexpr (M == 1) {
a0 *= 1.0 / double(DEGREE);
a1 *= 1.0 / double(DEGREE);
a2 *= 1.0 / double(DEGREE);
a3 *= 1.0 / double(DEGREE);
}
inverse_butterfly(&a0, &a1, w2);
inverse_butterfly(&a2, &a3, w3);
inverse_butterfly(&a0, &a2, w1);
inverse_butterfly(&a1, &a3, w1);
A[j0] = a0;
A[j1] = a1;
A[j2] = a2;
A[j3] = a3;
} else if constexpr (RADIX == 2) {
Index j0 = j + 0 * T;
Index j1 = j + 1 * T;
double2 a0 = A[j0];
double2 a1 = A[j1];
if constexpr (M == 1) {
a0 *= 1.0 / double(DEGREE);
a1 *= 1.0 / double(DEGREE);
}
inverse_butterfly(&a0, &a1, w1);
A[j0] = a0;
A[j1] = a1;
}
}
template <Index RADIX, Index DEGREE, Index OPT, Index M>
__device__ inline void NSMFFT_direct_impl(double2 *A, Index const tid) {
static_assert(OPT >= RADIX,
"params::opt should be larger than or equal to the fft radix");
constexpr Index SIZE = DEGREE / M;
if constexpr (coalesce_mem_access()) {
constexpr Index WARPSIZE = warpsize();
if constexpr (SIZE >= 2 * WARPSIZE) {
constexpr Index radix = (SIZE == 2 * WARPSIZE) ? 2
: ((SIZE / (2 * WARPSIZE)) % RADIX != 0) ? 2
: RADIX;
__syncthreads();
#pragma unroll
for (Index k = 0; k < OPT / radix; ++k) {
if constexpr (SIZE == 2 * WARPSIZE) {
static_assert(radix == 2, "");
Index i = tid / WARPSIZE;
Index j = i * SIZE + (tid % WARPSIZE);
Index j0 = j + 0 * WARPSIZE;
Index j1 = j + 1 * WARPSIZE;
double2 a0 = A[j0];
double2 a1 = A[j1];
NSMFFT_direct_2warpsize<M, WARPSIZE>(&a0, &a1, tid);
A[j0] = a0;
A[j1] = a1;
} else {
static_assert(SIZE > 2 * WARPSIZE, "");
NSMFFT_direct_step<radix, DEGREE, M>(A, tid + k * (DEGREE / OPT));
}
}
NSMFFT_direct_impl<RADIX, DEGREE, OPT, radix * M>(A, tid);
}
} else {
if constexpr (SIZE > 1) {
constexpr Index radix = (SIZE % RADIX != 0) ? 2 : RADIX;
sync_fft_step<SIZE>();
#pragma unroll
for (Index k = 0; k < OPT / radix; ++k) {
NSMFFT_direct_step<radix, DEGREE, M>(A, tid + k * (DEGREE / OPT));
}
NSMFFT_direct_impl<RADIX, DEGREE, OPT, radix * M>(A, tid);
}
}
}
template <Index RADIX, Index DEGREE, Index OPT, Index M>
__device__ inline void NSMFFT_inverse_impl(double2 *A, Index const tid) {
static_assert(OPT >= RADIX,
"params::opt should be larger than or equal to the fft radix");
constexpr Index SIZE = DEGREE / M;
if constexpr (coalesce_mem_access()) {
constexpr Index WARPSIZE = warpsize();
if constexpr (SIZE >= 2 * WARPSIZE) {
constexpr Index radix = (SIZE == 2 * WARPSIZE) ? 2
: ((SIZE / (2 * WARPSIZE)) % RADIX != 0) ? 2
: RADIX;
NSMFFT_inverse_impl<RADIX, DEGREE, OPT, radix * M>(A, tid);
#pragma unroll
for (Index k = 0; k < OPT / radix; ++k) {
if constexpr (SIZE == 2 * WARPSIZE) {
static_assert(radix == 2, "");
Index i = tid / WARPSIZE;
Index j = i * SIZE + (tid % WARPSIZE);
Index j0 = j + 0 * WARPSIZE;
Index j1 = j + 1 * WARPSIZE;
double2 a0 = A[j0];
double2 a1 = A[j1];
NSMFFT_inverse_2warpsize<M, WARPSIZE>(&a0, &a1, tid);
A[j0] = a0;
A[j1] = a1;
} else {
static_assert(SIZE > 2 * WARPSIZE, "");
NSMFFT_inverse_step<radix, DEGREE, M>(A, tid + k * (DEGREE / OPT));
}
}
__syncthreads();
}
} else {
if constexpr (SIZE > 1) {
constexpr Index radix = (SIZE % RADIX != 0) ? 2 : RADIX;
NSMFFT_inverse_impl<RADIX, DEGREE, OPT, radix * M>(A, tid);
#pragma unroll
for (Index k = 0; k < OPT / radix; ++k) {
NSMFFT_inverse_step<radix, DEGREE, M>(A, tid + k * (DEGREE / OPT));
}
sync_fft_step<SIZE>();
}
}
}
/*
* Direct negacyclic FFT:
* - before the FFT the N real coefficients are stored into a
@@ -376,19 +22,600 @@ __device__ inline void NSMFFT_inverse_impl(double2 *A, Index const tid) {
* is replaced with:
* \zeta_j,k = exp(-i pi (2j-1)/2^k)
*/
template <class params, int radix = tfhe_fft_default_radix()>
__device__ void NSMFFT_direct(double2 *A) {
NSMFFT_direct_impl<radix, params::degree, params::opt, 1>(A, threadIdx.x);
template <class params> __device__ void NSMFFT_direct(double2 *A) {
/* We don't make bit reverse here, since twiddles are already reversed
* Each thread is always in charge of "opt/2" pairs of coefficients,
* which is why we always loop through N/2 by N/opt strides
* The pragma unroll instruction tells the compiler to unroll the
* full loop, which should increase performance
*/
size_t tid = threadIdx.x;
size_t twid_id;
size_t i1, i2;
double2 u, v, w;
// level 1
// we don't make actual complex multiplication on level1 since we have only
// one twiddle, it's real and image parts are equal, so we can multiply
// it with simpler operations
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
i1 = tid;
i2 = tid + params::degree / 2;
u = A[i1];
v = A[i2] * (double2){0.707106781186547461715008466854,
0.707106781186547461715008466854};
A[i1] += v;
A[i2] = u - v;
tid += params::degree / params::opt;
}
__syncthreads();
// level 2
// from this level there are more than one twiddles and none of them has equal
// real and imag parts, so complete complex multiplication is needed
// for each level params::degree / 2^level represents number of coefficients
// inside divided chunk of specific level
//
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 4);
i1 = 2 * (params::degree / 4) * twid_id + (tid & (params::degree / 4 - 1));
i2 = i1 + params::degree / 4;
w = negtwiddles[twid_id + 2];
u = A[i1];
v = A[i2] * w;
A[i1] += v;
A[i2] = u - v;
tid += params::degree / params::opt;
}
__syncthreads();
// level 3
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 8);
i1 = 2 * (params::degree / 8) * twid_id + (tid & (params::degree / 8 - 1));
i2 = i1 + params::degree / 8;
w = negtwiddles[twid_id + 4];
u = A[i1];
v = A[i2] * w;
A[i1] += v;
A[i2] = u - v;
tid += params::degree / params::opt;
}
__syncthreads();
// level 4
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 16);
i1 =
2 * (params::degree / 16) * twid_id + (tid & (params::degree / 16 - 1));
i2 = i1 + params::degree / 16;
w = negtwiddles[twid_id + 8];
u = A[i1];
v = A[i2] * w;
A[i1] += v;
A[i2] = u - v;
tid += params::degree / params::opt;
}
__syncthreads();
// level 5
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 32);
i1 =
2 * (params::degree / 32) * twid_id + (tid & (params::degree / 32 - 1));
i2 = i1 + params::degree / 32;
w = negtwiddles[twid_id + 16];
u = A[i1];
v = A[i2] * w;
A[i1] += v;
A[i2] = u - v;
tid += params::degree / params::opt;
}
__syncthreads();
// level 6
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 64);
i1 =
2 * (params::degree / 64) * twid_id + (tid & (params::degree / 64 - 1));
i2 = i1 + params::degree / 64;
w = negtwiddles[twid_id + 32];
u = A[i1];
v = A[i2] * w;
A[i1] += v;
A[i2] = u - v;
tid += params::degree / params::opt;
}
__syncthreads();
// level 7
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 128);
i1 = 2 * (params::degree / 128) * twid_id +
(tid & (params::degree / 128 - 1));
i2 = i1 + params::degree / 128;
w = negtwiddles[twid_id + 64];
u = A[i1];
v = A[i2] * w;
A[i1] += v;
A[i2] = u - v;
tid += params::degree / params::opt;
}
__syncthreads();
// from level 8, we need to check size of params degree, because we support
// minimum actual polynomial size = 256, when compressed size is halfed and
// minimum supported compressed size is 128, so we always need first 7
// levels of butterfly operation, since butterfly levels are hardcoded
// we need to check if polynomial size is big enough to require specific level
// of butterfly.
if constexpr (params::degree >= 256) {
// level 8
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 256);
i1 = 2 * (params::degree / 256) * twid_id +
(tid & (params::degree / 256 - 1));
i2 = i1 + params::degree / 256;
w = negtwiddles[twid_id + 128];
u = A[i1];
v = A[i2] * w;
A[i1] += v;
A[i2] = u - v;
tid += params::degree / params::opt;
}
__syncthreads();
}
if constexpr (params::degree >= 512) {
// level 9
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 512);
i1 = 2 * (params::degree / 512) * twid_id +
(tid & (params::degree / 512 - 1));
i2 = i1 + params::degree / 512;
w = negtwiddles[twid_id + 256];
u = A[i1];
v = A[i2] * w;
A[i1] += v;
A[i2] = u - v;
tid += params::degree / params::opt;
}
__syncthreads();
}
if constexpr (params::degree >= 1024) {
// level 10
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 1024);
i1 = 2 * (params::degree / 1024) * twid_id +
(tid & (params::degree / 1024 - 1));
i2 = i1 + params::degree / 1024;
w = negtwiddles[twid_id + 512];
u = A[i1];
v = A[i2] * w;
A[i1] += v;
A[i2] = u - v;
tid += params::degree / params::opt;
}
__syncthreads();
}
if constexpr (params::degree >= 2048) {
// level 11
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 2048);
i1 = 2 * (params::degree / 2048) * twid_id +
(tid & (params::degree / 2048 - 1));
i2 = i1 + params::degree / 2048;
w = negtwiddles[twid_id + 1024];
u = A[i1];
v = A[i2] * w;
A[i1] += v;
A[i2] = u - v;
tid += params::degree / params::opt;
}
__syncthreads();
}
if constexpr (params::degree >= 4096) {
// level 12
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 4096);
i1 = 2 * (params::degree / 4096) * twid_id +
(tid & (params::degree / 4096 - 1));
i2 = i1 + params::degree / 4096;
w = negtwiddles[twid_id + 2048];
u = A[i1];
v = A[i2] * w;
A[i1] += v;
A[i2] = u - v;
tid += params::degree / params::opt;
}
__syncthreads();
}
// compressed size = 8192 is actual polynomial size = 16384.
// from this size, twiddles can't fit in constant memory,
// so from here, butterfly operation access device memory.
if constexpr (params::degree >= 8192) {
// level 13
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 8192);
i1 = 2 * (params::degree / 8192) * twid_id +
(tid & (params::degree / 8192 - 1));
i2 = i1 + params::degree / 8192;
w = negtwiddles13[twid_id];
u = A[i1];
v = A[i2] * w;
A[i1] += v;
A[i2] = u - v;
tid += params::degree / params::opt;
}
__syncthreads();
}
}
/*
* negacyclic inverse fft
*/
template <class params, int radix = tfhe_fft_default_radix()>
__device__ void NSMFFT_inverse(double2 *A) {
template <class params> __device__ void NSMFFT_inverse(double2 *A) {
/* We don't make bit reverse here, since twiddles are already reversed
* Each thread is always in charge of "opt/2" pairs of coefficients,
* which is why we always loop through N/2 by N/opt strides
* The pragma unroll instruction tells the compiler to unroll the
* full loop, which should increase performance
*/
size_t tid = threadIdx.x;
size_t twid_id;
size_t i1, i2;
double2 u, w;
// divide input by compressed polynomial size
tid = threadIdx.x;
for (size_t i = 0; i < params::opt; ++i) {
A[tid] /= params::degree;
tid += params::degree / params::opt;
}
__syncthreads();
// none of the twiddles have equal real and imag part, so
// complete complex multiplication has to be done
// here we have more than one twiddle
// mapping in backward fft is reversed
// butterfly operation is started from last level
// compressed size = 8192 is actual polynomial size = 16384.
// twiddles for this size can't fit in constant memory so
// butterfly operation for this level access device memory to fetch
// twiddles
if constexpr (params::degree >= 8192) {
// level 13
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 8192);
i1 = 2 * (params::degree / 8192) * twid_id +
(tid & (params::degree / 8192 - 1));
i2 = i1 + params::degree / 8192;
w = negtwiddles13[twid_id];
u = A[i1] - A[i2];
A[i1] += A[i2];
A[i2] = u * conjugate(w);
tid += params::degree / params::opt;
}
__syncthreads();
}
if constexpr (params::degree >= 4096) {
// level 12
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 4096);
i1 = 2 * (params::degree / 4096) * twid_id +
(tid & (params::degree / 4096 - 1));
i2 = i1 + params::degree / 4096;
w = negtwiddles[twid_id + 2048];
u = A[i1] - A[i2];
A[i1] += A[i2];
A[i2] = u * conjugate(w);
tid += params::degree / params::opt;
}
__syncthreads();
}
if constexpr (params::degree >= 2048) {
// level 11
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 2048);
i1 = 2 * (params::degree / 2048) * twid_id +
(tid & (params::degree / 2048 - 1));
i2 = i1 + params::degree / 2048;
w = negtwiddles[twid_id + 1024];
u = A[i1] - A[i2];
A[i1] += A[i2];
A[i2] = u * conjugate(w);
tid += params::degree / params::opt;
}
__syncthreads();
}
if constexpr (params::degree >= 1024) {
// level 10
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 1024);
i1 = 2 * (params::degree / 1024) * twid_id +
(tid & (params::degree / 1024 - 1));
i2 = i1 + params::degree / 1024;
w = negtwiddles[twid_id + 512];
u = A[i1] - A[i2];
A[i1] += A[i2];
A[i2] = u * conjugate(w);
tid += params::degree / params::opt;
}
__syncthreads();
}
if constexpr (params::degree >= 512) {
// level 9
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 512);
i1 = 2 * (params::degree / 512) * twid_id +
(tid & (params::degree / 512 - 1));
i2 = i1 + params::degree / 512;
w = negtwiddles[twid_id + 256];
u = A[i1] - A[i2];
A[i1] += A[i2];
A[i2] = u * conjugate(w);
tid += params::degree / params::opt;
}
__syncthreads();
}
if constexpr (params::degree >= 256) {
// level 8
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 256);
i1 = 2 * (params::degree / 256) * twid_id +
(tid & (params::degree / 256 - 1));
i2 = i1 + params::degree / 256;
w = negtwiddles[twid_id + 128];
u = A[i1] - A[i2];
A[i1] += A[i2];
A[i2] = u * conjugate(w);
tid += params::degree / params::opt;
}
__syncthreads();
}
// below level 8, we don't need to check size of params degree, because we
// support minimum actual polynomial size = 256, when compressed size is
// halfed and minimum supported compressed size is 128, so we always need
// last 7 levels of butterfly operation, since butterfly levels are hardcoded
// we don't need to check if polynomial size is big enough to require
// specific level of butterfly.
// level 7
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 128);
i1 = 2 * (params::degree / 128) * twid_id +
(tid & (params::degree / 128 - 1));
i2 = i1 + params::degree / 128;
w = negtwiddles[twid_id + 64];
u = A[i1] - A[i2];
A[i1] += A[i2];
A[i2] = u * conjugate(w);
tid += params::degree / params::opt;
}
__syncthreads();
// level 6
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 64);
i1 =
2 * (params::degree / 64) * twid_id + (tid & (params::degree / 64 - 1));
i2 = i1 + params::degree / 64;
w = negtwiddles[twid_id + 32];
u = A[i1] - A[i2];
A[i1] += A[i2];
A[i2] = u * conjugate(w);
tid += params::degree / params::opt;
}
__syncthreads();
// level 5
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 32);
i1 =
2 * (params::degree / 32) * twid_id + (tid & (params::degree / 32 - 1));
i2 = i1 + params::degree / 32;
w = negtwiddles[twid_id + 16];
u = A[i1] - A[i2];
A[i1] += A[i2];
A[i2] = u * conjugate(w);
tid += params::degree / params::opt;
}
__syncthreads();
// level 4
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 16);
i1 =
2 * (params::degree / 16) * twid_id + (tid & (params::degree / 16 - 1));
i2 = i1 + params::degree / 16;
w = negtwiddles[twid_id + 8];
u = A[i1] - A[i2];
A[i1] += A[i2];
A[i2] = u * conjugate(w);
tid += params::degree / params::opt;
}
__syncthreads();
// level 3
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 8);
i1 = 2 * (params::degree / 8) * twid_id + (tid & (params::degree / 8 - 1));
i2 = i1 + params::degree / 8;
w = negtwiddles[twid_id + 4];
u = A[i1] - A[i2];
A[i1] += A[i2];
A[i2] = u * conjugate(w);
tid += params::degree / params::opt;
}
__syncthreads();
// level 2
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 4);
i1 = 2 * (params::degree / 4) * twid_id + (tid & (params::degree / 4 - 1));
i2 = i1 + params::degree / 4;
w = negtwiddles[twid_id + 2];
u = A[i1] - A[i2];
A[i1] += A[i2];
A[i2] = u * conjugate(w);
tid += params::degree / params::opt;
}
__syncthreads();
// level 1
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 2);
i1 = 2 * (params::degree / 2) * twid_id + (tid & (params::degree / 2 - 1));
i2 = i1 + params::degree / 2;
w = negtwiddles[twid_id + 1];
u = A[i1] - A[i2];
A[i1] += A[i2];
A[i2] = u * conjugate(w);
tid += params::degree / params::opt;
}
__syncthreads();
NSMFFT_inverse_impl<radix, params::degree, params::opt, 1>(A, threadIdx.x);
}
/*

View File

@@ -1,6 +1,6 @@
#include "cuComplex.h"
__device__ double2 negtwiddles[8192] = {
__constant__ double2 negtwiddles[4096] = {
{0, 0},
{0.707106781186547461715008466854, 0.707106781186547572737310929369},
{0.92387953251128673848313610506, 0.382683432365089781779232680492},
@@ -4096,7 +4096,9 @@ __device__ double2 negtwiddles[8192] = {
{0.70791982920081630847874976098, 0.706292797233758484765075991163},
{-0.706292797233758484765075991163, 0.70791982920081630847874976098},
{0.00115048533711384847431913325266, 0.99999933819152553304832053982},
{-0.99999933819152553304832053982, 0.00115048533711384847431913325266},
{-0.99999933819152553304832053982, 0.00115048533711384847431913325266}};
__device__ double2 negtwiddles13[4096] = {
{0.999999981616429334252416083473, 0.000191747597310703291528452552051},
{-0.000191747597310703291528452552051, 0.999999981616429334252416083473},
{0.706971182161065359039753275283, 0.707242354213734603085583785287},

View File

@@ -2,7 +2,12 @@
#define GPU_BOOTSTRAP_TWIDDLES_CUH
/*
* 'negtwiddles' are stored in device memory to profit caching
* 'negtwiddles' are stored in constant memory for faster access times
* because of it's limited size, only twiddles for up to 2^12 polynomial size
* can be stored there, twiddles for 2^13 are stored in device memory
* 'negtwiddles13'
*/
extern __device__ double2 negtwiddles[8192];
extern __constant__ double2 negtwiddles[4096];
extern __device__ double2 negtwiddles13[4096];
#endif

View File

@@ -1,49 +0,0 @@
#include "integer/addition.cuh"
void scratch_cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_blocks, int8_t signed_operation,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory) {
SIGNED_OPERATION op = (signed_operation == 1) ? SIGNED_OPERATION::ADDITION
: SIGNED_OPERATION::SUBTRACTION;
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
big_lwe_dimension, small_lwe_dimension, ks_level,
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
message_modulus, carry_modulus);
scratch_cuda_integer_signed_overflowing_add_or_sub_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_signed_overflowing_add_or_sub_memory<uint64_t> **)mem_ptr,
num_blocks, op, params, allocate_gpu_memory);
}
void cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lhs,
void *rhs, void *overflowed, int8_t signed_operation, int8_t *mem_ptr,
void **bsks, void **ksks, uint32_t num_blocks) {
auto mem = (int_signed_overflowing_add_or_sub_memory<uint64_t> *)mem_ptr;
SIGNED_OPERATION op = (signed_operation == 1) ? SIGNED_OPERATION::ADDITION
: SIGNED_OPERATION::SUBTRACTION;
host_integer_signed_overflowing_add_or_sub_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lhs), static_cast<uint64_t *>(rhs),
static_cast<uint64_t *>(overflowed), op, bsks, (uint64_t **)(ksks), mem,
num_blocks);
}
void cleanup_signed_overflowing_add_or_sub(void **streams,
uint32_t *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void) {
int_signed_overflowing_add_or_sub_memory<uint64_t> *mem_ptr =
(int_signed_overflowing_add_or_sub_memory<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
}

View File

@@ -1,137 +0,0 @@
#ifndef TFHE_RS_ADDITION_CUH
#define TFHE_RS_ADDITION_CUH
#include "crypto/keyswitch.cuh"
#include "device.h"
#include "integer.h"
#include "integer/comparison.cuh"
#include "integer/integer.cuh"
#include "integer/negation.cuh"
#include "integer/scalar_shifts.cuh"
#include "linear_algebra.h"
#include "programmable_bootstrap.h"
#include "utils/helper.cuh"
#include "utils/kernel_dimensions.cuh"
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>
template <typename Torus>
void host_resolve_signed_overflow(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *result, Torus *last_block_inner_propagation,
Torus *last_block_input_carry, Torus *last_block_output_carry,
int_resolve_signed_overflow_memory<Torus> *mem, void **bsks, Torus **ksks) {
auto x = mem->x;
Torus *d_clears =
(Torus *)cuda_malloc_async(sizeof(Torus), streams[0], gpu_indexes[0]);
cuda_set_value_async<Torus>(streams[0], gpu_indexes[0], d_clears, 2, 1);
// replace with host function call
cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
streams[0], gpu_indexes[0], x, last_block_output_carry, d_clears,
mem->params.big_lwe_dimension, 1);
host_addition(streams[0], gpu_indexes[0], last_block_inner_propagation,
last_block_inner_propagation, x, mem->params.big_lwe_dimension,
1);
host_addition(streams[0], gpu_indexes[0], last_block_inner_propagation,
last_block_inner_propagation, last_block_input_carry,
mem->params.big_lwe_dimension, 1);
host_apply_univariate_lut_kb<Torus>(streams, gpu_indexes, gpu_count, result,
last_block_inner_propagation,
mem->resolve_overflow_lut, ksks, bsks, 1);
cuda_drop_async(d_clears, streams[0], gpu_indexes[0]);
}
template <typename Torus>
__host__ void scratch_cuda_integer_signed_overflowing_add_or_sub_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
int_signed_overflowing_add_or_sub_memory<Torus> **mem_ptr,
uint32_t num_blocks, SIGNED_OPERATION op, int_radix_params params,
bool allocate_gpu_memory) {
*mem_ptr = new int_signed_overflowing_add_or_sub_memory<Torus>(
streams, gpu_indexes, gpu_count, params, num_blocks, op,
allocate_gpu_memory);
}
/*
* Addition - signed_operation = 1
* Subtraction - signed_operation = -1
*/
template <typename Torus>
__host__ void host_integer_signed_overflowing_add_or_sub_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lhs, Torus *rhs, Torus *overflowed, SIGNED_OPERATION op, void **bsks,
uint64_t **ksks,
int_signed_overflowing_add_or_sub_memory<uint64_t> *mem_ptr,
uint32_t num_blocks) {
auto radix_params = mem_ptr->params;
uint32_t big_lwe_dimension = radix_params.big_lwe_dimension;
uint32_t big_lwe_size = big_lwe_dimension + 1;
uint32_t big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
assert(radix_params.message_modulus >= 4 && radix_params.carry_modulus >= 4);
auto result = mem_ptr->result;
auto neg_rhs = mem_ptr->neg_rhs;
auto input_carries = mem_ptr->input_carries;
auto output_carry = mem_ptr->output_carry;
auto last_block_inner_propagation = mem_ptr->last_block_inner_propagation;
cuda_memcpy_async_gpu_to_gpu(result, lhs, num_blocks * big_lwe_size_bytes,
streams[0], gpu_indexes[0]);
// phase 1
if (op == SIGNED_OPERATION::ADDITION) {
host_addition(streams[0], gpu_indexes[0], result, lhs, rhs,
big_lwe_dimension, num_blocks);
} else {
host_integer_radix_negation(
streams, gpu_indexes, gpu_count, neg_rhs, rhs, big_lwe_dimension,
num_blocks, radix_params.message_modulus, radix_params.carry_modulus);
host_addition(streams[0], gpu_indexes[0], result, lhs, neg_rhs,
big_lwe_dimension, num_blocks);
}
// phase 2
for (uint j = 0; j < gpu_count; j++) {
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
}
host_propagate_single_carry(mem_ptr->sub_streams_1, gpu_indexes, gpu_count,
result, output_carry, input_carries,
mem_ptr->scp_mem, bsks, ksks, num_blocks);
host_generate_last_block_inner_propagation(
mem_ptr->sub_streams_2, gpu_indexes, gpu_count,
last_block_inner_propagation, &lhs[(num_blocks - 1) * big_lwe_size],
&rhs[(num_blocks - 1) * big_lwe_size], mem_ptr->las_block_prop_mem, bsks,
ksks);
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
}
// phase 3
auto input_carry = &input_carries[(num_blocks - 1) * big_lwe_size];
host_resolve_signed_overflow(
streams, gpu_indexes, gpu_count, overflowed, last_block_inner_propagation,
input_carry, output_carry, mem_ptr->resolve_overflow_mem, bsks, ksks);
cuda_memcpy_async_gpu_to_gpu(lhs, result, num_blocks * big_lwe_size_bytes,
streams[0], gpu_indexes[0]);
}
#endif // TFHE_RS_ADDITION_CUH

View File

@@ -34,6 +34,19 @@ void cuda_bitop_integer_radix_ciphertext_kb_64(
lwe_ciphertext_count);
}
void cuda_bitnot_integer_radix_ciphertext_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, void *lwe_array_in, int8_t *mem_ptr, void **bsks,
void **ksks, uint32_t lwe_ciphertext_count) {
host_integer_radix_bitnot_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_in),
(int_bitop_buffer<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
lwe_ciphertext_count);
}
void cleanup_cuda_integer_bitop(void **streams, uint32_t *gpu_indexes,
uint32_t gpu_count, int8_t **mem_ptr_void) {

View File

@@ -26,6 +26,19 @@ host_integer_radix_bitop_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
bsks, ksks, num_radix_blocks, lut, lut->params.message_modulus);
}
template <typename Torus>
__host__ void host_integer_radix_bitnot_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array_out, Torus *lwe_array_in, int_bitop_buffer<Torus> *mem_ptr,
void **bsks, Torus **ksks, uint32_t num_radix_blocks) {
auto lut = mem_ptr->lut;
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in, bsks, ksks,
num_radix_blocks, lut);
}
template <typename Torus>
__host__ void scratch_cuda_integer_radix_bitop_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,

View File

@@ -2,6 +2,7 @@
#define CUDA_INTEGER_CMUX_CUH
#include "integer.cuh"
#include <omp.h>
template <typename Torus>
__host__ void zero_out_if(cudaStream_t *streams, uint32_t *gpu_indexes,
@@ -56,18 +57,27 @@ __host__ void host_integer_radix_cmux_kb(
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
}
auto mem_true = mem_ptr->zero_if_true_buffer;
zero_out_if(true_streams, gpu_indexes, gpu_count, mem_ptr->tmp_true_ct,
lwe_array_true, lwe_condition, mem_true,
mem_ptr->inverted_predicate_lut, bsks, ksks, num_radix_blocks);
auto mem_false = mem_ptr->zero_if_false_buffer;
zero_out_if(false_streams, gpu_indexes, gpu_count, mem_ptr->tmp_false_ct,
lwe_array_false, lwe_condition, mem_false, mem_ptr->predicate_lut,
bsks, ksks, num_radix_blocks);
for (uint j = 0; j < mem_ptr->zero_if_true_buffer->active_gpu_count; j++) {
cuda_synchronize_stream(true_streams[j], gpu_indexes[j]);
#pragma omp parallel sections
{
// Both sections may be executed in parallel
#pragma omp section
{
auto mem_true = mem_ptr->zero_if_true_buffer;
zero_out_if(true_streams, gpu_indexes, gpu_count, mem_ptr->tmp_true_ct,
lwe_array_true, lwe_condition, mem_true,
mem_ptr->inverted_predicate_lut, bsks, ksks,
num_radix_blocks);
}
#pragma omp section
{
auto mem_false = mem_ptr->zero_if_false_buffer;
zero_out_if(false_streams, gpu_indexes, gpu_count, mem_ptr->tmp_false_ct,
lwe_array_false, lwe_condition, mem_false,
mem_ptr->predicate_lut, bsks, ksks, num_radix_blocks);
}
}
for (uint j = 0; j < mem_ptr->zero_if_false_buffer->active_gpu_count; j++) {
for (uint j = 0; j < gpu_count; j++) {
cuda_synchronize_stream(true_streams[j], gpu_indexes[j]);
cuda_synchronize_stream(false_streams[j], gpu_indexes[j]);
}

View File

@@ -245,6 +245,7 @@ __host__ void host_compare_with_zero_equality(
int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
int32_t num_radix_blocks, int_radix_lut<Torus> *zero_comparison) {
cudaSetDevice(gpu_indexes[0]);
auto params = mem_ptr->params;
auto big_lwe_dimension = params.big_lwe_dimension;
auto message_modulus = params.message_modulus;

View File

@@ -1,87 +0,0 @@
#include "compression.cuh"
void scratch_cuda_integer_compress_radix_ciphertext_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t num_lwes, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, uint32_t lwe_per_glwe, uint32_t storage_log_modulus,
bool allocate_gpu_memory) {
int_radix_params compression_params(
pbs_type, compression_glwe_dimension, compression_polynomial_size,
(compression_glwe_dimension + 1) * compression_polynomial_size,
lwe_dimension, ks_level, ks_base_log, 0, 0, 0, message_modulus,
carry_modulus);
scratch_cuda_compress_integer_radix_ciphertext_64(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_compression<uint64_t> **)mem_ptr, num_lwes, compression_params,
lwe_per_glwe, storage_log_modulus, allocate_gpu_memory);
}
void scratch_cuda_integer_decompress_radix_ciphertext_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
uint32_t encryption_glwe_dimension, uint32_t encryption_polynomial_size,
uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
uint32_t lwe_dimension, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t num_lwes, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, uint32_t storage_log_modulus, bool allocate_gpu_memory) {
int_radix_params encryption_params(
pbs_type, encryption_glwe_dimension, encryption_polynomial_size,
(encryption_glwe_dimension + 1) * encryption_polynomial_size,
lwe_dimension, 0, 0, pbs_level, pbs_base_log, 0, message_modulus,
carry_modulus);
int_radix_params compression_params(
pbs_type, compression_glwe_dimension, compression_polynomial_size,
(compression_glwe_dimension + 1) * compression_polynomial_size,
lwe_dimension, 0, 0, pbs_level, pbs_base_log, 0, message_modulus,
carry_modulus);
scratch_cuda_integer_decompress_radix_ciphertext_64(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_decompression<uint64_t> **)mem_ptr, num_lwes, encryption_params,
compression_params, storage_log_modulus, allocate_gpu_memory);
}
void cuda_integer_compress_radix_ciphertext_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *glwe_array_out, void *lwe_array_in, void **fp_ksk, uint32_t num_nths,
int8_t *mem_ptr) {
host_integer_compress<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(glwe_array_out),
static_cast<uint64_t *>(lwe_array_in), (uint64_t **)(fp_ksk), num_nths,
(int_compression<uint64_t> *)mem_ptr);
}
void cuda_integer_decompress_radix_ciphertext_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, void *glwe_in, void *indexes_array,
uint32_t indexes_array_size, void **bsks, int8_t *mem_ptr) {
host_integer_decompress<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array_out), static_cast<uint64_t *>(glwe_in),
static_cast<uint32_t *>(indexes_array), indexes_array_size, bsks,
(int_decompression<uint64_t> *)mem_ptr);
}
void cleanup_cuda_integer_compress_radix_ciphertext_64(void **streams,
uint32_t *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void) {
int_compression<uint64_t> *mem_ptr =
(int_compression<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
}
void cleanup_cuda_integer_decompress_radix_ciphertext_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void) {
int_decompression<uint64_t> *mem_ptr =
(int_decompression<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
}

View File

@@ -1,238 +0,0 @@
#ifndef CUDA_INTEGER_COMPRESSION_CUH
#define CUDA_INTEGER_COMPRESSION_CUH
#include "ciphertext.h"
#include "compression.h"
#include "crypto/keyswitch.cuh"
#include "device.h"
#include "integer/integer.cuh"
#include "linearalgebra/multiplication.cuh"
#include "polynomial/functions.cuh"
#include "utils/kernel_dimensions.cuh"
template <typename Torus>
__global__ void pack(Torus *array_out, Torus *array_in, uint32_t log_modulus,
uint32_t in_len, uint32_t len) {
auto nbits = sizeof(Torus) * 8;
auto i = threadIdx.x + blockIdx.x * blockDim.x;
if (i < len) {
auto k = nbits * i / log_modulus;
auto j = k;
auto start_shift = i * nbits - j * log_modulus;
auto value = array_in[j] >> start_shift;
j++;
while (j * log_modulus < ((i + 1) * nbits) && j < in_len) {
auto shift = j * log_modulus - i * nbits;
value |= array_in[j] << shift;
j++;
}
array_out[i] = value;
}
}
template <typename Torus>
__host__ void host_pack(cudaStream_t stream, uint32_t gpu_index,
Torus *array_out, Torus *array_in, uint32_t num_inputs,
uint32_t body_count, int_compression<Torus> *mem_ptr) {
cudaSetDevice(gpu_index);
auto params = mem_ptr->compression_params;
auto log_modulus = mem_ptr->storage_log_modulus;
auto in_len = params.glwe_dimension * params.polynomial_size + body_count;
auto number_bits_to_pack = in_len * log_modulus;
auto nbits = sizeof(Torus) * 8;
// number_bits_to_pack.div_ceil(Scalar::BITS)
auto len = (number_bits_to_pack + nbits - 1) / nbits;
int num_blocks = 0, num_threads = 0;
getNumBlocksAndThreads(len, 128, num_blocks, num_threads);
dim3 grid(num_blocks);
dim3 threads(num_threads);
pack<<<grid, threads, 0, stream>>>(array_out, array_in, log_modulus, in_len,
len);
}
template <typename Torus>
__host__ void host_integer_compress(cudaStream_t *streams,
uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *glwe_array_out, Torus *lwe_array_in,
Torus **fp_ksk, uint32_t num_lwes,
int_compression<Torus> *mem_ptr) {
auto compression_params = mem_ptr->compression_params;
auto input_lwe_dimension = compression_params.small_lwe_dimension;
// Shift
auto lwe_shifted = mem_ptr->tmp_lwe;
host_cleartext_multiplication(streams[0], gpu_indexes[0], lwe_shifted,
lwe_array_in,
(uint64_t)compression_params.message_modulus,
input_lwe_dimension, num_lwes);
uint32_t lwe_in_size = input_lwe_dimension + 1;
uint32_t glwe_out_size = (compression_params.glwe_dimension + 1) *
compression_params.polynomial_size;
uint32_t num_glwes = num_lwes / mem_ptr->lwe_per_glwe + 1;
// Keyswitch LWEs to GLWE
auto tmp_glwe_array_out = mem_ptr->tmp_glwe_array_out;
auto fp_ks_buffer = mem_ptr->fp_ks_buffer;
for (int i = 0; i < num_glwes; i++) {
auto lwe_subset = lwe_shifted + i * lwe_in_size;
auto glwe_out = tmp_glwe_array_out + i * glwe_out_size;
host_packing_keyswitch_lwe_list_to_glwe(
streams[0], gpu_indexes[0], glwe_out, lwe_subset, fp_ksk[0],
fp_ks_buffer, input_lwe_dimension, compression_params.glwe_dimension,
compression_params.polynomial_size, compression_params.ks_base_log,
compression_params.ks_level, min(num_lwes, mem_ptr->lwe_per_glwe));
}
auto body_count = min(num_lwes, mem_ptr->lwe_per_glwe);
// Modulus switch
host_modulus_switch_inplace(streams[0], gpu_indexes[0], tmp_glwe_array_out,
num_glwes *
(compression_params.glwe_dimension *
compression_params.polynomial_size +
body_count),
mem_ptr->storage_log_modulus);
check_cuda_error(cudaGetLastError());
host_pack(streams[0], gpu_indexes[0], glwe_array_out, tmp_glwe_array_out,
num_glwes, body_count, mem_ptr);
}
template <typename Torus>
__global__ void extract(Torus *glwe_array_out, Torus *array_in, uint32_t index,
uint32_t log_modulus, uint32_t initial_out_len) {
auto nbits = sizeof(Torus) * 8;
auto i = threadIdx.x + blockIdx.x * blockDim.x;
if (i < initial_out_len) {
// Unpack
Torus mask = ((Torus)1 << log_modulus) - 1;
auto start = i * log_modulus;
auto end = (i + 1) * log_modulus;
auto start_block = start / nbits;
auto start_remainder = start % nbits;
auto end_block_inclusive = (end - 1) / nbits;
Torus unpacked_i;
if (start_block == end_block_inclusive) {
auto single_part = array_in[start_block] >> start_remainder;
unpacked_i = single_part & mask;
} else {
auto first_part = array_in[start_block] >> start_remainder;
auto second_part = array_in[start_block + 1] << (nbits - start_remainder);
unpacked_i = (first_part | second_part) & mask;
}
// Extract
glwe_array_out[i] = unpacked_i << (nbits - log_modulus);
}
}
template <typename Torus>
__host__ void host_extract(cudaStream_t stream, uint32_t gpu_index,
Torus *glwe_array_out, Torus *array_in,
uint32_t glwe_index,
int_decompression<Torus> *mem_ptr) {
cudaSetDevice(gpu_index);
auto params = mem_ptr->compression_params;
auto log_modulus = mem_ptr->storage_log_modulus;
uint32_t body_count = mem_ptr->body_count;
auto initial_out_len =
params.glwe_dimension * params.polynomial_size + body_count * body_count;
// We assure the tail of the glwe is zeroed
auto zeroed_slice =
glwe_array_out + params.glwe_dimension * params.polynomial_size;
cuda_memset_async(zeroed_slice, 0, params.polynomial_size * sizeof(Torus),
stream, gpu_index);
int num_blocks = 0, num_threads = 0;
getNumBlocksAndThreads(initial_out_len, 128, num_blocks, num_threads);
dim3 grid(num_blocks);
dim3 threads(num_threads);
extract<<<grid, threads, 0, stream>>>(glwe_array_out, array_in, glwe_index,
log_modulus, initial_out_len);
check_cuda_error(cudaGetLastError());
}
template <typename Torus>
__host__ void
host_integer_decompress(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array_out,
Torus *packed_glwe_in, uint32_t *indexes_array,
uint32_t indexes_array_size, void **bsks,
int_decompression<Torus> *mem_ptr) {
auto extracted_glwe = mem_ptr->tmp_extracted_glwe;
auto compression_params = mem_ptr->compression_params;
host_extract(streams[0], gpu_indexes[0], extracted_glwe, packed_glwe_in, 0,
mem_ptr);
auto num_lwes = mem_ptr->body_count;
// Sample extract
auto extracted_lwe = mem_ptr->tmp_extracted_lwe;
cuda_glwe_sample_extract_64(streams[0], gpu_indexes[0], extracted_lwe,
extracted_glwe, indexes_array, indexes_array_size,
compression_params.glwe_dimension,
compression_params.polynomial_size);
/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
auto encryption_params = mem_ptr->encryption_params;
auto carry_extract_lut = mem_ptr->carry_extract_lut;
execute_pbs_async<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out,
carry_extract_lut->lwe_indexes_out, carry_extract_lut->lut_vec,
carry_extract_lut->lut_indexes_vec, extracted_lwe,
carry_extract_lut->lwe_indexes_in, bsks, carry_extract_lut->buffer,
encryption_params.glwe_dimension,
compression_params.glwe_dimension * compression_params.polynomial_size,
encryption_params.polynomial_size, encryption_params.pbs_base_log,
encryption_params.pbs_level, encryption_params.grouping_factor, num_lwes,
encryption_params.pbs_type);
}
template <typename Torus>
__host__ void scratch_cuda_compress_integer_radix_ciphertext_64(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
int_compression<Torus> **mem_ptr, uint32_t num_lwes,
int_radix_params compression_params, uint32_t lwe_per_glwe,
uint32_t storage_log_modulus, bool allocate_gpu_memory) {
*mem_ptr = new int_compression<Torus>(
streams, gpu_indexes, gpu_count, compression_params, num_lwes,
lwe_per_glwe, storage_log_modulus, allocate_gpu_memory);
}
template <typename Torus>
__host__ void scratch_cuda_integer_decompress_radix_ciphertext_64(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
int_decompression<Torus> **mem_ptr, uint32_t num_lwes,
int_radix_params encryption_params, int_radix_params compression_params,
uint32_t storage_log_modulus, bool allocate_gpu_memory) {
*mem_ptr = new int_decompression<Torus>(
streams, gpu_indexes, gpu_count, encryption_params, compression_params,
num_lwes, storage_log_modulus, allocate_gpu_memory);
}
#endif

View File

@@ -26,11 +26,54 @@ void cuda_integer_div_rem_radix_ciphertext_kb_64(
auto mem = (int_div_rem_memory<uint64_t> *)mem_ptr;
host_integer_div_rem_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(quotient), static_cast<uint64_t *>(remainder),
static_cast<uint64_t *>(numerator), static_cast<uint64_t *>(divisor),
bsks, (uint64_t **)(ksks), mem, num_blocks);
switch (mem->params.polynomial_size) {
case 512:
host_integer_div_rem_kb<uint64_t, Degree<512>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(quotient), static_cast<uint64_t *>(remainder),
static_cast<uint64_t *>(numerator), static_cast<uint64_t *>(divisor),
bsks, (uint64_t **)(ksks), mem, num_blocks);
break;
case 1024:
host_integer_div_rem_kb<uint64_t, Degree<1024>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(quotient), static_cast<uint64_t *>(remainder),
static_cast<uint64_t *>(numerator), static_cast<uint64_t *>(divisor),
bsks, (uint64_t **)(ksks), mem, num_blocks);
break;
case 2048:
host_integer_div_rem_kb<uint64_t, Degree<2048>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(quotient), static_cast<uint64_t *>(remainder),
static_cast<uint64_t *>(numerator), static_cast<uint64_t *>(divisor),
bsks, (uint64_t **)(ksks), mem, num_blocks);
break;
case 4096:
host_integer_div_rem_kb<uint64_t, Degree<4096>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(quotient), static_cast<uint64_t *>(remainder),
static_cast<uint64_t *>(numerator), static_cast<uint64_t *>(divisor),
bsks, (uint64_t **)(ksks), mem, num_blocks);
break;
case 8192:
host_integer_div_rem_kb<uint64_t, Degree<8192>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(quotient), static_cast<uint64_t *>(remainder),
static_cast<uint64_t *>(numerator), static_cast<uint64_t *>(divisor),
bsks, (uint64_t **)(ksks), mem, num_blocks);
break;
case 16384:
host_integer_div_rem_kb<uint64_t, Degree<16384>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(quotient), static_cast<uint64_t *>(remainder),
static_cast<uint64_t *>(numerator), static_cast<uint64_t *>(divisor),
bsks, (uint64_t **)(ksks), mem, num_blocks);
break;
default:
PANIC("Cuda error (integer div_rem): unsupported polynomial size. "
"Only N = 512, 1024, 2048, 4096, 8192, 16384 is supported")
}
}
void cleanup_cuda_integer_div_rem(void **streams, uint32_t *gpu_indexes,

View File

@@ -14,6 +14,7 @@
#include "utils/kernel_dimensions.cuh"
#include <fstream>
#include <iostream>
#include <omp.h>
#include <sstream>
#include <string>
#include <vector>
@@ -30,13 +31,17 @@ template <typename Torus> struct lwe_ciphertext_list {
int_radix_params params;
size_t big_lwe_size;
size_t radix_size;
size_t big_lwe_size_bytes;
size_t radix_size_bytes;
size_t big_lwe_dimension;
lwe_ciphertext_list(Torus *src, int_radix_params params, size_t max_blocks)
: data(src), params(params), max_blocks(max_blocks) {
big_lwe_size = params.big_lwe_dimension + 1;
big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
radix_size = max_blocks * big_lwe_size;
radix_size_bytes = radix_size * sizeof(Torus);
big_lwe_dimension = params.big_lwe_dimension;
len = max_blocks;
}
@@ -168,7 +173,7 @@ __host__ void scratch_cuda_integer_div_rem_kb(
streams, gpu_indexes, gpu_count, params, num_blocks, allocate_gpu_memory);
}
template <typename Torus>
template <typename Torus, class params>
__host__ void
host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, Torus *quotient, Torus *remainder,
@@ -371,19 +376,35 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
for (uint j = 0; j < gpu_count; j++) {
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
}
// interesting_divisor
trim_last_interesting_divisor_bits(mem_ptr->sub_streams_1, gpu_indexes,
gpu_count);
// divisor_ms_blocks
trim_first_divisor_ms_bits(mem_ptr->sub_streams_2, gpu_indexes, gpu_count);
// interesting_remainder1
// numerator_block_stack
left_shift_interesting_remainder1(mem_ptr->sub_streams_3, gpu_indexes,
gpu_count);
// interesting_remainder2
left_shift_interesting_remainder2(mem_ptr->sub_streams_4, gpu_indexes,
gpu_count);
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
#pragma omp parallel sections
{
#pragma omp section
{
// interesting_divisor
trim_last_interesting_divisor_bits(mem_ptr->sub_streams_1, gpu_indexes,
gpu_count);
}
#pragma omp section
{
// divisor_ms_blocks
trim_first_divisor_ms_bits(mem_ptr->sub_streams_2, gpu_indexes,
gpu_count);
}
#pragma omp section
{
// interesting_remainder1
// numerator_block_stack
left_shift_interesting_remainder1(mem_ptr->sub_streams_3, gpu_indexes,
gpu_count);
}
#pragma omp section
{
// interesting_remainder2
left_shift_interesting_remainder2(mem_ptr->sub_streams_4, gpu_indexes,
gpu_count);
}
}
for (uint j = 0; j < gpu_count; j++) {
cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]);
@@ -418,7 +439,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
// `subtraction_overflowed` - single ciphertext
auto do_overflowing_sub = [&](cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count) {
host_integer_overflowing_sub_kb<Torus>(
host_integer_overflowing_sub_kb<Torus, params>(
streams, gpu_indexes, gpu_count, new_remainder.data,
subtraction_overflowed.data, merged_interesting_remainder.data,
interesting_divisor.data, bsks, ksks, mem_ptr->overflow_sub_mem,
@@ -472,15 +493,28 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
for (uint j = 0; j < gpu_count; j++) {
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
}
// new_remainder
// subtraction_overflowed
do_overflowing_sub(mem_ptr->sub_streams_1, gpu_indexes, gpu_count);
// at_least_one_upper_block_is_non_zero
check_divisor_upper_blocks(mem_ptr->sub_streams_2, gpu_indexes, gpu_count);
// cleaned_merged_interesting_remainder
create_clean_version_of_merged_remainder(mem_ptr->sub_streams_3,
gpu_indexes, gpu_count);
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
#pragma omp parallel sections
{
#pragma omp section
{
// new_remainder
// subtraction_overflowed
do_overflowing_sub(mem_ptr->sub_streams_1, gpu_indexes, gpu_count);
}
#pragma omp section
{
// at_least_one_upper_block_is_non_zero
check_divisor_upper_blocks(mem_ptr->sub_streams_2, gpu_indexes,
gpu_count);
}
#pragma omp section
{
// cleaned_merged_interesting_remainder
create_clean_version_of_merged_remainder(mem_ptr->sub_streams_3,
gpu_indexes, gpu_count);
}
}
for (uint j = 0; j < gpu_count; j++) {
cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]);
@@ -537,15 +571,27 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
for (uint j = 0; j < gpu_count; j++) {
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
}
// cleaned_merged_interesting_remainder
conditionally_zero_out_merged_interesting_remainder(mem_ptr->sub_streams_1,
gpu_indexes, gpu_count);
// new_remainder
conditionally_zero_out_merged_new_remainder(mem_ptr->sub_streams_2,
gpu_indexes, gpu_count);
// quotient
set_quotient_bit(mem_ptr->sub_streams_3, gpu_indexes, gpu_count);
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
#pragma omp parallel sections
{
#pragma omp section
{
// cleaned_merged_interesting_remainder
conditionally_zero_out_merged_interesting_remainder(
mem_ptr->sub_streams_1, gpu_indexes, gpu_count);
}
#pragma omp section
{
// new_remainder
conditionally_zero_out_merged_new_remainder(mem_ptr->sub_streams_2,
gpu_indexes, gpu_count);
}
#pragma omp section
{
// quotient
set_quotient_bit(mem_ptr->sub_streams_3, gpu_indexes, gpu_count);
}
}
for (uint j = 0; j < gpu_count; j++) {
cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]);
@@ -571,13 +617,22 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
for (uint j = 0; j < gpu_count; j++) {
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
}
integer_radix_apply_univariate_lookup_table_kb(
mem_ptr->sub_streams_1, gpu_indexes, gpu_count, remainder, remainder,
bsks, ksks, num_blocks, mem_ptr->message_extract_lut_1);
integer_radix_apply_univariate_lookup_table_kb(
mem_ptr->sub_streams_2, gpu_indexes, gpu_count, quotient, quotient, bsks,
ksks, num_blocks, mem_ptr->message_extract_lut_2);
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
#pragma omp parallel sections
{
#pragma omp section
{
integer_radix_apply_univariate_lookup_table_kb(
mem_ptr->sub_streams_1, gpu_indexes, gpu_count, remainder, remainder,
bsks, ksks, num_blocks, mem_ptr->message_extract_lut_1);
}
#pragma omp section
{
integer_radix_apply_univariate_lookup_table_kb(
mem_ptr->sub_streams_2, gpu_indexes, gpu_count, quotient, quotient,
bsks, ksks, num_blocks, mem_ptr->message_extract_lut_2);
}
}
for (uint j = 0; j < gpu_count; j++) {
cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
}

View File

@@ -19,8 +19,9 @@ void scratch_cuda_full_propagation_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory) {
uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t num_radix_blocks,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
glwe_dimension * polynomial_size, lwe_dimension,
ks_level, ks_base_log, pbs_level, pbs_base_log,
@@ -28,7 +29,8 @@ void scratch_cuda_full_propagation_64(
scratch_cuda_full_propagation<uint64_t>(
(cudaStream_t *)streams, gpu_indexes, gpu_count,
(int_fullprop_buffer<uint64_t> **)mem_ptr, params, allocate_gpu_memory);
(int_fullprop_buffer<uint64_t> **)mem_ptr, params, num_radix_blocks,
allocate_gpu_memory);
}
void cleanup_cuda_full_propagation(void **streams, uint32_t *gpu_indexes,
@@ -66,18 +68,6 @@ void cuda_propagate_single_carry_kb_64_inplace(
host_propagate_single_carry<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array), static_cast<uint64_t *>(carry_out),
nullptr, (int_sc_prop_memory<uint64_t> *)mem_ptr, bsks,
(uint64_t **)(ksks), num_blocks);
}
void cuda_propagate_single_carry_get_input_carries_kb_64_inplace(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
void *carry_out, void *input_carries, int8_t *mem_ptr, void **bsks,
void **ksks, uint32_t num_blocks) {
host_propagate_single_carry<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array), static_cast<uint64_t *>(carry_out),
static_cast<uint64_t *>(input_carries),
(int_sc_prop_memory<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
num_blocks);
}
@@ -173,55 +163,3 @@ void cleanup_cuda_apply_bivariate_lut_kb_64(void **streams,
int_radix_lut<uint64_t> *mem_ptr = (int_radix_lut<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
}
void scratch_cuda_integer_compute_prefix_sum_hillis_steele_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
void *input_lut, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, bool allocate_gpu_memory) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
glwe_dimension * polynomial_size, lwe_dimension,
ks_level, ks_base_log, pbs_level, pbs_base_log,
grouping_factor, message_modulus, carry_modulus);
scratch_cuda_apply_bivariate_lut_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_radix_lut<uint64_t> **)mem_ptr, static_cast<uint64_t *>(input_lut),
num_radix_blocks, params, allocate_gpu_memory);
}
void cuda_integer_compute_prefix_sum_hillis_steele_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *output_radix_lwe, void *input_radix_lwe, int8_t *mem_ptr, void **ksks,
void **bsks, uint32_t num_blocks, uint32_t shift) {
int_radix_params params = ((int_radix_lut<uint64_t> *)mem_ptr)->params;
host_compute_prefix_sum_hillis_steele<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(output_radix_lwe),
static_cast<uint64_t *>(input_radix_lwe), params,
(int_radix_lut<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
num_blocks);
}
void cleanup_cuda_integer_compute_prefix_sum_hillis_steele_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void) {
int_radix_lut<uint64_t> *mem_ptr = (int_radix_lut<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
}
void cuda_integer_reverse_blocks_64_inplace(void **streams,
uint32_t *gpu_indexes,
uint32_t gpu_count, void *lwe_array,
uint32_t num_blocks,
uint32_t lwe_size) {
host_radix_blocks_reverse_inplace<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes,
static_cast<uint64_t *>(lwe_array), num_blocks, lwe_size);
}

View File

@@ -3,7 +3,6 @@
#include "crypto/keyswitch.cuh"
#include "device.h"
#include "helper_multi_gpu.h"
#include "integer.h"
#include "integer/scalar_addition.cuh"
#include "linear_algebra.h"
@@ -11,7 +10,6 @@
#include "polynomial/functions.cuh"
#include "programmable_bootstrap.h"
#include "utils/helper.cuh"
#include "utils/helper_multi_gpu.cuh"
#include "utils/kernel_dimensions.cuh"
#include <functional>
@@ -22,19 +20,18 @@ template <typename Torus>
__global__ void radix_blocks_rotate_right(Torus *dst, Torus *src,
uint32_t value, uint32_t blocks_count,
uint32_t lwe_size) {
value %= blocks_count;
size_t tid = threadIdx.x;
if (tid < lwe_size) {
value %= blocks_count;
size_t src_block_id = blockIdx.x;
size_t dst_block_id = (src_block_id + value) % blocks_count;
size_t stride = blockDim.x;
size_t src_block_id = blockIdx.x;
size_t dst_block_id = (src_block_id + value) % blocks_count;
size_t stride = blockDim.x;
auto cur_src_block = &src[src_block_id * lwe_size];
auto cur_dst_block = &dst[dst_block_id * lwe_size];
auto cur_src_block = &src[src_block_id * lwe_size];
auto cur_dst_block = &dst[dst_block_id * lwe_size];
for (size_t i = tid; i < lwe_size; i += stride) {
cur_dst_block[i] = cur_src_block[i];
}
for (size_t i = tid; i < lwe_size; i += stride) {
cur_dst_block[i] = cur_src_block[i];
}
}
@@ -45,28 +42,25 @@ template <typename Torus>
__global__ void radix_blocks_rotate_left(Torus *dst, Torus *src, uint32_t value,
uint32_t blocks_count,
uint32_t lwe_size) {
value %= blocks_count;
size_t src_block_id = blockIdx.x;
size_t tid = threadIdx.x;
if (tid < lwe_size) {
value %= blocks_count;
size_t src_block_id = blockIdx.x;
size_t dst_block_id = (src_block_id >= value)
? src_block_id - value
: src_block_id - value + blocks_count;
size_t stride = blockDim.x;
size_t dst_block_id = (src_block_id >= value)
? src_block_id - value
: src_block_id - value + blocks_count;
size_t stride = blockDim.x;
auto cur_src_block = &src[src_block_id * lwe_size];
auto cur_dst_block = &dst[dst_block_id * lwe_size];
auto cur_src_block = &src[src_block_id * lwe_size];
auto cur_dst_block = &dst[dst_block_id * lwe_size];
for (size_t i = tid; i < lwe_size; i += stride) {
cur_dst_block[i] = cur_src_block[i];
}
for (size_t i = tid; i < lwe_size; i += stride) {
cur_dst_block[i] = cur_src_block[i];
}
}
// rotate radix ciphertext right with specific value
// calculation is not inplace, so `dst` and `src` must not be the same
// one block is responsible to process single lwe ciphertext
template <typename Torus>
__host__ void
host_radix_blocks_rotate_right(cudaStream_t *streams, uint32_t *gpu_indexes,
@@ -99,35 +93,6 @@ host_radix_blocks_rotate_left(cudaStream_t *streams, uint32_t *gpu_indexes,
dst, src, value, blocks_count, lwe_size);
}
// reverse the blocks in a list
// each cuda block swaps a couple of blocks
template <typename Torus>
__global__ void radix_blocks_reverse_lwe_inplace(Torus *src,
uint32_t blocks_count,
uint32_t lwe_size) {
size_t idx = blockIdx.x;
size_t rev_idx = blocks_count - 1 - idx;
for (int j = threadIdx.x; j < lwe_size; j += blockDim.x) {
Torus back_element = src[rev_idx * lwe_size + j];
Torus front_element = src[idx * lwe_size + j];
src[idx * lwe_size + j] = back_element;
src[rev_idx * lwe_size + j] = front_element;
}
}
template <typename Torus>
__host__ void
host_radix_blocks_reverse_inplace(cudaStream_t *streams, uint32_t *gpu_indexes,
Torus *src, uint32_t blocks_count,
uint32_t lwe_size) {
cudaSetDevice(gpu_indexes[0]);
int num_blocks = blocks_count / 2, num_threads = 1024;
radix_blocks_reverse_lwe_inplace<<<num_blocks, num_threads, 0, streams[0]>>>(
src, blocks_count, lwe_size);
}
// polynomial_size threads
template <typename Torus>
__global__ void
@@ -188,67 +153,28 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
auto polynomial_size = params.polynomial_size;
auto grouping_factor = params.grouping_factor;
/// For multi GPU execution we create vectors of pointers for inputs and
/// outputs
std::vector<Torus *> lwe_array_in_vec = lut->lwe_array_in_vec;
std::vector<Torus *> lwe_after_ks_vec = lut->lwe_after_ks_vec;
std::vector<Torus *> lwe_after_pbs_vec = lut->lwe_after_pbs_vec;
std::vector<Torus *> lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec;
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
/// Apply KS to go from a big LWE dimension to a small LWE dimension
execute_keyswitch<Torus>(streams, gpu_indexes, gpu_count,
lut->tmp_lwe_after_ks, lut->lwe_trivial_indexes,
lwe_array_in, lut->lwe_indexes_in, ksks,
big_lwe_dimension, small_lwe_dimension, ks_base_log,
ks_level, num_radix_blocks, false);
/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
execute_pbs<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, lut->lwe_indexes_out,
lut->lut_vec, lut->lut_indexes_vec, lut->tmp_lwe_after_ks,
lut->lwe_trivial_indexes, bsks, lut->buffer, glwe_dimension,
small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
grouping_factor, num_radix_blocks, 1, 0,
cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type, false);
/// Synchronize all GPUs
auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
if (active_gpu_count == 1) {
execute_keyswitch_async<Torus>(streams, gpu_indexes, 1, lwe_after_ks_vec[0],
lwe_trivial_indexes_vec[0], lwe_array_in,
lut->lwe_indexes_in, ksks, big_lwe_dimension,
small_lwe_dimension, ks_base_log, ks_level,
num_radix_blocks);
/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
execute_pbs_async<Torus>(
streams, gpu_indexes, 1, lwe_array_out, lut->lwe_indexes_out,
lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec[0],
lwe_trivial_indexes_vec[0], bsks, lut->buffer, glwe_dimension,
small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
grouping_factor, num_radix_blocks, pbs_type);
} else {
/// Make sure all data that should be on GPU 0 is indeed there
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
/// With multiple GPUs we push to the vectors on each GPU then when we
/// gather data to GPU 0 we can copy back to the original indexing
multi_gpu_scatter_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, lwe_array_in_vec, lwe_array_in,
lut->h_lwe_indexes_in, lut->using_trivial_lwe_indexes, num_radix_blocks,
big_lwe_dimension + 1);
/// Apply KS to go from a big LWE dimension to a small LWE dimension
execute_keyswitch_async<Torus>(streams, gpu_indexes, active_gpu_count,
lwe_after_ks_vec, lwe_trivial_indexes_vec,
lwe_array_in_vec, lwe_trivial_indexes_vec,
ksks, big_lwe_dimension, small_lwe_dimension,
ks_base_log, ks_level, num_radix_blocks);
/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
execute_pbs_async<Torus>(
streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec,
lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec,
lwe_after_ks_vec, lwe_trivial_indexes_vec, bsks, lut->buffer,
glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log,
pbs_level, grouping_factor, num_radix_blocks, pbs_type);
/// Copy data back to GPU 0 and release vecs
multi_gpu_gather_lwe_async<Torus>(streams, gpu_indexes, active_gpu_count,
lwe_array_out, lwe_after_pbs_vec,
lut->h_lwe_indexes_out,
lut->using_trivial_lwe_indexes,
num_radix_blocks, big_lwe_dimension + 1);
/// Synchronize all GPUs
for (uint i = 0; i < active_gpu_count; i++) {
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
}
for (uint i = 0; i < active_gpu_count; i++) {
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
}
}
@@ -279,63 +205,29 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
num_radix_blocks);
check_cuda_error(cudaGetLastError());
/// For multi GPU execution we create vectors of pointers for inputs and
/// outputs
std::vector<Torus *> lwe_array_in_vec = lut->lwe_array_in_vec;
std::vector<Torus *> lwe_after_ks_vec = lut->lwe_after_ks_vec;
std::vector<Torus *> lwe_after_pbs_vec = lut->lwe_after_pbs_vec;
std::vector<Torus *> lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec;
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
/// Apply KS to go from a big LWE dimension to a small LWE dimension
execute_keyswitch<Torus>(streams, gpu_indexes, gpu_count,
lut->tmp_lwe_after_ks, lut->lwe_trivial_indexes,
lwe_array_pbs_in, lut->lwe_indexes_in, ksks,
big_lwe_dimension, small_lwe_dimension, ks_base_log,
ks_level, num_radix_blocks, false);
/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
execute_pbs<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, lut->lwe_indexes_out,
lut->lut_vec, lut->lut_indexes_vec, lut->tmp_lwe_after_ks,
lut->lwe_trivial_indexes, bsks, lut->buffer, glwe_dimension,
small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
grouping_factor, num_radix_blocks, 1, 0,
cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type, false);
/// Synchronize all GPUs
auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
if (active_gpu_count == 1) {
execute_keyswitch_async<Torus>(streams, gpu_indexes, 1, lwe_after_ks_vec[0],
lwe_trivial_indexes_vec[0], lwe_array_pbs_in,
lut->lwe_indexes_in, ksks, big_lwe_dimension,
small_lwe_dimension, ks_base_log, ks_level,
num_radix_blocks);
/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
execute_pbs_async<Torus>(
streams, gpu_indexes, 1, lwe_array_out, lut->lwe_indexes_out,
lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec[0],
lwe_trivial_indexes_vec[0], bsks, lut->buffer, glwe_dimension,
small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
grouping_factor, num_radix_blocks, pbs_type);
} else {
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
multi_gpu_scatter_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, lwe_array_in_vec,
lwe_array_pbs_in, lut->h_lwe_indexes_in, lut->using_trivial_lwe_indexes,
num_radix_blocks, big_lwe_dimension + 1);
/// Apply KS to go from a big LWE dimension to a small LWE dimension
execute_keyswitch_async<Torus>(streams, gpu_indexes, active_gpu_count,
lwe_after_ks_vec, lwe_trivial_indexes_vec,
lwe_array_in_vec, lwe_trivial_indexes_vec,
ksks, big_lwe_dimension, small_lwe_dimension,
ks_base_log, ks_level, num_radix_blocks);
/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
execute_pbs_async<Torus>(
streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec,
lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec,
lwe_after_ks_vec, lwe_trivial_indexes_vec, bsks, lut->buffer,
glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log,
pbs_level, grouping_factor, num_radix_blocks, pbs_type);
/// Copy data back to GPU 0 and release vecs
multi_gpu_gather_lwe_async<Torus>(streams, gpu_indexes, active_gpu_count,
lwe_array_out, lwe_after_pbs_vec,
lut->h_lwe_indexes_out,
lut->using_trivial_lwe_indexes,
num_radix_blocks, big_lwe_dimension + 1);
/// Synchronize all GPUs
for (uint i = 0; i < active_gpu_count; i++) {
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
}
for (uint i = 0; i < active_gpu_count; i++) {
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
}
}
@@ -433,6 +325,7 @@ void generate_device_accumulator_bivariate(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t message_modulus,
uint32_t carry_modulus, std::function<Torus(Torus, Torus)> f) {
cudaSetDevice(gpu_index);
// host lut
Torus *h_lut =
(Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus));
@@ -442,14 +335,14 @@ void generate_device_accumulator_bivariate(
message_modulus, carry_modulus, f);
// copy host lut and lut_indexes_vec to device
cuda_synchronize_stream(stream, gpu_index);
cuda_memcpy_async_to_gpu(acc_bivariate, h_lut,
(glwe_dimension + 1) * polynomial_size *
sizeof(Torus),
stream, gpu_index);
cuda_synchronize_stream(stream, gpu_index);
free(h_lut);
// Release memory when possible
cuda_stream_add_callback(stream, gpu_index, host_free_on_stream_callback,
h_lut);
}
/*
@@ -465,6 +358,7 @@ void generate_device_accumulator_bivariate_with_factor(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t message_modulus,
uint32_t carry_modulus, std::function<Torus(Torus, Torus)> f, int factor) {
cudaSetDevice(gpu_index);
// host lut
Torus *h_lut =
(Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus));
@@ -474,15 +368,15 @@ void generate_device_accumulator_bivariate_with_factor(
h_lut, glwe_dimension, polynomial_size, message_modulus, carry_modulus, f,
factor);
cuda_synchronize_stream(stream, gpu_index);
// copy host lut and lut_indexes_vec to device
cuda_memcpy_async_to_gpu(acc_bivariate, h_lut,
(glwe_dimension + 1) * polynomial_size *
sizeof(Torus),
stream, gpu_index);
cuda_synchronize_stream(stream, gpu_index);
free(h_lut);
// Release memory when possible
cuda_stream_add_callback(stream, gpu_index, host_free_on_stream_callback,
h_lut);
}
/*
@@ -500,6 +394,7 @@ void generate_device_accumulator(cudaStream_t stream, uint32_t gpu_index,
uint32_t carry_modulus,
std::function<Torus(Torus)> f) {
cudaSetDevice(gpu_index);
// host lut
Torus *h_lut =
(Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus));
@@ -508,14 +403,14 @@ void generate_device_accumulator(cudaStream_t stream, uint32_t gpu_index,
generate_lookup_table<Torus>(h_lut, glwe_dimension, polynomial_size,
message_modulus, carry_modulus, f);
cuda_synchronize_stream(stream, gpu_index);
// copy host lut and lut_indexes_vec to device
cuda_memcpy_async_to_gpu(
acc, h_lut, (glwe_dimension + 1) * polynomial_size * sizeof(Torus),
stream, gpu_index);
cuda_synchronize_stream(stream, gpu_index);
free(h_lut);
// Release memory when possible
cuda_stream_add_callback(stream, gpu_index, host_free_on_stream_callback,
h_lut);
}
template <typename Torus>
@@ -529,47 +424,10 @@ void scratch_cuda_propagate_single_carry_kb_inplace(
num_radix_blocks, allocate_gpu_memory);
}
template <typename Torus>
void host_compute_prefix_sum_hillis_steele(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *step_output, Torus *generates_or_propagates, int_radix_params params,
int_radix_lut<Torus> *luts, void **bsks, Torus **ksks,
uint32_t num_blocks) {
auto glwe_dimension = params.glwe_dimension;
auto polynomial_size = params.polynomial_size;
auto big_lwe_size = glwe_dimension * polynomial_size + 1;
auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
int num_steps = ceil(log2((double)num_blocks));
int space = 1;
cuda_memcpy_async_gpu_to_gpu(step_output, generates_or_propagates,
big_lwe_size_bytes * num_blocks, streams[0],
gpu_indexes[0]);
for (int step = 0; step < num_steps; step++) {
if (space > num_blocks - 1)
PANIC("Cuda error: step output is going out of bounds in Hillis Steele "
"propagation")
auto cur_blocks = &step_output[space * big_lwe_size];
auto prev_blocks = generates_or_propagates;
int cur_total_blocks = num_blocks - space;
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, cur_blocks, cur_blocks, prev_blocks,
bsks, ksks, cur_total_blocks, luts, luts->params.message_modulus);
cuda_memcpy_async_gpu_to_gpu(
&generates_or_propagates[space * big_lwe_size], cur_blocks,
big_lwe_size_bytes * cur_total_blocks, streams[0], gpu_indexes[0]);
space *= 2;
}
}
template <typename Torus>
void host_propagate_single_carry(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array,
Torus *carry_out, Torus *input_carries,
Torus *carry_out,
int_sc_prop_memory<Torus> *mem, void **bsks,
Torus **ksks, uint32_t num_blocks) {
auto params = mem->params;
@@ -590,9 +448,29 @@ void host_propagate_single_carry(cudaStream_t *streams, uint32_t *gpu_indexes,
ksks, num_blocks, luts_array);
// compute prefix sum with hillis&steele
host_compute_prefix_sum_hillis_steele(
streams, gpu_indexes, gpu_count, step_output, generates_or_propagates,
params, luts_carry_propagation_sum, bsks, ksks, num_blocks);
int num_steps = ceil(log2((double)num_blocks));
int space = 1;
cuda_memcpy_async_gpu_to_gpu(step_output, generates_or_propagates,
big_lwe_size_bytes * num_blocks, streams[0],
gpu_indexes[0]);
for (int step = 0; step < num_steps; step++) {
auto cur_blocks = &step_output[space * big_lwe_size];
auto prev_blocks = generates_or_propagates;
int cur_total_blocks = num_blocks - space;
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, cur_blocks, cur_blocks, prev_blocks,
bsks, ksks, cur_total_blocks, luts_carry_propagation_sum,
luts_carry_propagation_sum->params.message_modulus);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
cuda_memcpy_async_gpu_to_gpu(
&generates_or_propagates[space * big_lwe_size], cur_blocks,
big_lwe_size_bytes * cur_total_blocks, streams[0], gpu_indexes[0]);
space *= 2;
}
host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count, step_output,
generates_or_propagates, 1, num_blocks,
@@ -604,12 +482,6 @@ void host_propagate_single_carry(cudaStream_t *streams, uint32_t *gpu_indexes,
cuda_memset_async(step_output, 0, big_lwe_size_bytes, streams[0],
gpu_indexes[0]);
if (input_carries != nullptr) {
cuda_memcpy_async_gpu_to_gpu(input_carries, step_output,
big_lwe_size_bytes * num_blocks, streams[0],
gpu_indexes[0]);
}
host_addition(streams[0], gpu_indexes[0], lwe_array, lwe_array, step_output,
glwe_dimension * polynomial_size, num_blocks);
@@ -618,24 +490,11 @@ void host_propagate_single_carry(cudaStream_t *streams, uint32_t *gpu_indexes,
num_blocks, message_acc);
}
template <typename Torus>
void host_generate_last_block_inner_propagation(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *last_block_inner_propagation, Torus *lhs, Torus *rhs,
int_last_block_inner_propagate_memory<Torus> *mem, void **bsks,
Torus **ksks) {
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, last_block_inner_propagation, lhs, rhs,
bsks, ksks, 1, mem->last_block_inner_propagation_lut,
mem->params.message_modulus);
}
template <typename Torus>
void host_propagate_single_sub_borrow(cudaStream_t *streams,
uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *overflowed, Torus *lwe_array,
int_overflowing_sub_memory<Torus> *mem,
int_single_borrow_prop_memory<Torus> *mem,
void **bsks, Torus **ksks,
uint32_t num_blocks) {
auto params = mem->params;
@@ -656,9 +515,27 @@ void host_propagate_single_sub_borrow(cudaStream_t *streams,
ksks, num_blocks, luts_array);
// compute prefix sum with hillis&steele
host_compute_prefix_sum_hillis_steele<Torus>(
streams, gpu_indexes, gpu_count, step_output, generates_or_propagates,
params, luts_carry_propagation_sum, bsks, ksks, num_blocks);
int num_steps = ceil(log2((double)num_blocks));
int space = 1;
cuda_memcpy_async_gpu_to_gpu(step_output, generates_or_propagates,
big_lwe_size_bytes * num_blocks, streams[0],
gpu_indexes[0]);
for (int step = 0; step < num_steps; step++) {
auto cur_blocks = &step_output[space * big_lwe_size];
auto prev_blocks = generates_or_propagates;
int cur_total_blocks = num_blocks - space;
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, cur_blocks, cur_blocks, prev_blocks,
bsks, ksks, cur_total_blocks, luts_carry_propagation_sum,
luts_carry_propagation_sum->params.message_modulus);
cuda_memcpy_async_gpu_to_gpu(
&generates_or_propagates[space * big_lwe_size], cur_blocks,
big_lwe_size_bytes * cur_total_blocks, streams[0], gpu_indexes[0]);
space *= 2;
}
cuda_memcpy_async_gpu_to_gpu(
overflowed, &generates_or_propagates[big_lwe_size * (num_blocks - 1)],
@@ -700,11 +577,12 @@ void host_full_propagate_inplace(cudaStream_t *streams, uint32_t *gpu_indexes,
for (int i = 0; i < num_blocks; i++) {
auto cur_input_block = &input_blocks[i * big_lwe_size];
cudaSetDevice(gpu_indexes[0]);
/// Since the keyswitch is done on one input only, use only 1 GPU
execute_keyswitch_async<Torus>(
streams, gpu_indexes, 1, mem_ptr->tmp_small_lwe_vector,
cuda_keyswitch_lwe_ciphertext_vector<Torus>(
streams[0], gpu_indexes[0], mem_ptr->tmp_small_lwe_vector,
mem_ptr->lut->lwe_trivial_indexes, cur_input_block,
mem_ptr->lut->lwe_trivial_indexes, ksks, params.big_lwe_dimension,
mem_ptr->lut->lwe_trivial_indexes, ksks[0], params.big_lwe_dimension,
params.small_lwe_dimension, params.ks_base_log, params.ks_level, 1);
cuda_memcpy_async_gpu_to_gpu(&mem_ptr->tmp_small_lwe_vector[small_lwe_size],
@@ -712,14 +590,15 @@ void host_full_propagate_inplace(cudaStream_t *streams, uint32_t *gpu_indexes,
small_lwe_size * sizeof(Torus), streams[0],
gpu_indexes[0]);
execute_pbs_async<Torus>(
execute_pbs<Torus>(
streams, gpu_indexes, 1, mem_ptr->tmp_big_lwe_vector,
mem_ptr->lut->lwe_trivial_indexes, mem_ptr->lut->lut_vec,
mem_ptr->lut->lut_indexes_vec, mem_ptr->tmp_small_lwe_vector,
mem_ptr->lut->lwe_trivial_indexes, bsks, mem_ptr->lut->buffer,
params.glwe_dimension, params.small_lwe_dimension,
params.polynomial_size, params.pbs_base_log, params.pbs_level,
params.grouping_factor, 2, params.pbs_type);
params.grouping_factor, 2, 2, 0,
cuda_get_max_shared_memory(gpu_indexes[0]), params.pbs_type);
cuda_memcpy_async_gpu_to_gpu(cur_input_block, mem_ptr->tmp_big_lwe_vector,
big_lwe_size * sizeof(Torus), streams[0],
@@ -740,10 +619,12 @@ void scratch_cuda_full_propagation(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count,
int_fullprop_buffer<Torus> **mem_ptr,
int_radix_params params,
uint32_t num_radix_blocks,
bool allocate_gpu_memory) {
*mem_ptr = new int_fullprop_buffer<Torus>(streams, gpu_indexes, gpu_count,
params, allocate_gpu_memory);
*mem_ptr =
new int_fullprop_buffer<Torus>(streams, gpu_indexes, gpu_count, params,
num_radix_blocks, allocate_gpu_memory);
}
// (lwe_dimension+1) threads
@@ -765,7 +646,7 @@ __global__ void device_pack_blocks(Torus *lwe_array_out, Torus *lwe_array_in,
}
if (num_radix_blocks % 2 == 1) {
// We couldn't host_pack the last block, so we just copy it
// We couldn't pack the last block, so we just copy it
Torus *lsb_block =
lwe_array_in + (num_radix_blocks - 1) * (lwe_dimension + 1);
Torus *last_block =
@@ -788,9 +669,8 @@ __host__ void pack_blocks(cudaStream_t stream, uint32_t gpu_index,
Torus *lwe_array_out, Torus *lwe_array_in,
uint32_t lwe_dimension, uint32_t num_radix_blocks,
uint32_t factor) {
if (num_radix_blocks == 0)
return;
cudaSetDevice(gpu_index);
int num_blocks = 0, num_threads = 0;
int num_entries = (lwe_dimension + 1);
getNumBlocksAndThreads(num_entries, 1024, num_blocks, num_threads);

View File

@@ -71,7 +71,7 @@ void scratch_cuda_integer_mult_radix_ciphertext_kb_64(
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t pbs_base_log,
uint32_t pbs_level, uint32_t ks_base_log, uint32_t ks_level,
uint32_t grouping_factor, uint32_t num_radix_blocks, PBS_TYPE pbs_type,
bool allocate_gpu_memory) {
uint32_t max_shared_memory, bool allocate_gpu_memory) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
polynomial_size * glwe_dimension, lwe_dimension,
@@ -123,6 +123,7 @@ void scratch_cuda_integer_mult_radix_ciphertext_kb_64(
* - 'num_blocks' is the number of big lwe ciphertext blocks inside radix
* ciphertext
* - 'pbs_type' selects which PBS implementation should be used
* - 'max_shared_memory' maximum shared memory per cuda block
*/
void cuda_integer_mult_radix_ciphertext_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
@@ -132,7 +133,7 @@ void cuda_integer_mult_radix_ciphertext_kb_64(
switch (polynomial_size) {
case 256:
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<256>>(
host_integer_mult_radix_kb<uint64_t, int64_t, AmortizedDegree<256>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_left),
@@ -140,7 +141,7 @@ void cuda_integer_mult_radix_ciphertext_kb_64(
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
break;
case 512:
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<512>>(
host_integer_mult_radix_kb<uint64_t, int64_t, AmortizedDegree<512>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_left),
@@ -148,7 +149,7 @@ void cuda_integer_mult_radix_ciphertext_kb_64(
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
break;
case 1024:
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<1024>>(
host_integer_mult_radix_kb<uint64_t, int64_t, AmortizedDegree<1024>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_left),
@@ -156,7 +157,7 @@ void cuda_integer_mult_radix_ciphertext_kb_64(
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
break;
case 2048:
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<2048>>(
host_integer_mult_radix_kb<uint64_t, int64_t, AmortizedDegree<2048>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_left),
@@ -164,7 +165,7 @@ void cuda_integer_mult_radix_ciphertext_kb_64(
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
break;
case 4096:
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<4096>>(
host_integer_mult_radix_kb<uint64_t, int64_t, AmortizedDegree<4096>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_left),
@@ -172,7 +173,7 @@ void cuda_integer_mult_radix_ciphertext_kb_64(
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
break;
case 8192:
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<8192>>(
host_integer_mult_radix_kb<uint64_t, int64_t, AmortizedDegree<8192>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_left),
@@ -180,7 +181,7 @@ void cuda_integer_mult_radix_ciphertext_kb_64(
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
break;
case 16384:
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<16384>>(
host_integer_mult_radix_kb<uint64_t, int64_t, AmortizedDegree<16384>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_left),
@@ -202,7 +203,7 @@ void cleanup_cuda_integer_mult(void **streams, uint32_t *gpu_indexes,
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
}
void scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
void scratch_cuda_integer_radix_sum_ciphertexts_vec_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t lwe_dimension,
uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
@@ -215,13 +216,13 @@ void scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
glwe_dimension * polynomial_size, lwe_dimension,
ks_level, ks_base_log, pbs_level, pbs_base_log,
grouping_factor, message_modulus, carry_modulus);
scratch_cuda_integer_partial_sum_ciphertexts_vec_kb<uint64_t>(
scratch_cuda_integer_sum_ciphertexts_vec_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_sum_ciphertexts_vec_memory<uint64_t> **)mem_ptr, num_blocks_in_radix,
max_num_radix_in_vec, params, allocate_gpu_memory);
}
void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
void cuda_integer_radix_sum_ciphertexts_vec_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *radix_lwe_out, void *radix_lwe_vec, uint32_t num_radix_in_vec,
int8_t *mem_ptr, void **bsks, void **ksks, uint32_t num_blocks_in_radix) {
@@ -237,47 +238,42 @@ void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
switch (mem->params.polynomial_size) {
case 512:
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t, AmortizedDegree<512>>(
host_integer_sum_ciphertexts_vec_kb<uint64_t, AmortizedDegree<512>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsks,
(uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec);
break;
case 1024:
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
AmortizedDegree<1024>>(
host_integer_sum_ciphertexts_vec_kb<uint64_t, AmortizedDegree<1024>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsks,
(uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec);
break;
case 2048:
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
AmortizedDegree<2048>>(
host_integer_sum_ciphertexts_vec_kb<uint64_t, AmortizedDegree<2048>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsks,
(uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec);
break;
case 4096:
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
AmortizedDegree<4096>>(
host_integer_sum_ciphertexts_vec_kb<uint64_t, AmortizedDegree<4096>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsks,
(uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec);
break;
case 8192:
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
AmortizedDegree<8192>>(
host_integer_sum_ciphertexts_vec_kb<uint64_t, AmortizedDegree<8192>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsks,
(uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec);
break;
case 16384:
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
AmortizedDegree<16384>>(
host_integer_sum_ciphertexts_vec_kb<uint64_t, AmortizedDegree<16384>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsks,
@@ -291,9 +287,10 @@ void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
free(terms_degree);
}
void cleanup_cuda_integer_radix_partial_sum_ciphertexts_vec(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void) {
void cleanup_cuda_integer_radix_sum_ciphertexts_vec(void **streams,
uint32_t *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void) {
int_sum_ciphertexts_vec_memory<uint64_t> *mem_ptr =
(int_sum_ciphertexts_vec_memory<uint64_t> *)(*mem_ptr_void);

View File

@@ -8,13 +8,11 @@
#include "crypto/keyswitch.cuh"
#include "device.h"
#include "helper_multi_gpu.h"
#include "integer.h"
#include "integer/integer.cuh"
#include "linear_algebra.h"
#include "programmable_bootstrap.h"
#include "utils/helper.cuh"
#include "utils/helper_multi_gpu.cuh"
#include "utils/kernel_dimensions.cuh"
#include <fstream>
#include <iostream>
@@ -93,11 +91,15 @@ all_shifted_lhs_rhs(Torus *radix_lwe_left, Torus *lsb_ciphertext,
}
}
template <typename Torus>
template <typename Torus, sharedMemDegree SMD>
__global__ void tree_add_chunks(Torus *result_blocks, Torus *input_blocks,
uint32_t chunk_size, uint32_t block_size,
uint32_t num_blocks) {
extern __shared__ int8_t sharedmem[];
Torus *result = (Torus *)sharedmem;
size_t stride = blockDim.x;
size_t chunk_id = blockIdx.x;
size_t chunk_elem_size = chunk_size * num_blocks * block_size;
@@ -105,7 +107,10 @@ __global__ void tree_add_chunks(Torus *result_blocks, Torus *input_blocks,
auto src_chunk = &input_blocks[chunk_id * chunk_elem_size];
auto dst_radix = &result_blocks[chunk_id * radix_elem_size];
size_t block_stride = blockIdx.y * block_size;
auto result = &dst_radix[block_stride];
auto dst_block = &dst_radix[block_stride];
if constexpr (SMD == NOSM)
result = dst_block;
// init shared mem with first radix of chunk
size_t tid = threadIdx.x;
@@ -120,12 +125,18 @@ __global__ void tree_add_chunks(Torus *result_blocks, Torus *input_blocks,
result[i] += cur_src_radix[block_stride + i];
}
}
// put result from shared mem to global mem
if constexpr (SMD == FULLSM)
for (int i = tid; i < block_size; i += stride)
dst_block[i] = result[i];
}
template <typename Torus, class params>
__global__ void fill_radix_from_lsb_msb(Torus *result_blocks, Torus *lsb_blocks,
Torus *msb_blocks,
uint32_t glwe_dimension,
uint32_t lsb_count, uint32_t msb_count,
uint32_t num_blocks) {
size_t big_lwe_dimension = glwe_dimension * params::degree + 1;
size_t big_lwe_id = blockIdx.x;
@@ -169,24 +180,38 @@ __global__ void fill_radix_from_lsb_msb(Torus *result_blocks, Torus *lsb_blocks,
}
}
template <typename Torus>
__host__ void scratch_cuda_integer_partial_sum_ciphertexts_vec_kb(
__host__ void scratch_cuda_integer_sum_ciphertexts_vec_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
int_sum_ciphertexts_vec_memory<Torus> **mem_ptr,
uint32_t num_blocks_in_radix, uint32_t max_num_radix_in_vec,
int_radix_params params, bool allocate_gpu_memory) {
size_t sm_size = (params.big_lwe_dimension + 1) * sizeof(Torus);
if (sm_size < cuda_get_max_shared_memory(gpu_indexes[0])) {
check_cuda_error(cudaFuncSetAttribute(
tree_add_chunks<Torus, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, sm_size));
cudaFuncSetCacheConfig(tree_add_chunks<Torus, FULLSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else {
check_cuda_error(
cudaFuncSetAttribute(tree_add_chunks<Torus, NOSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
cudaFuncSetCacheConfig(tree_add_chunks<Torus, NOSM>, cudaFuncCachePreferL1);
check_cuda_error(cudaGetLastError());
}
*mem_ptr = new int_sum_ciphertexts_vec_memory<Torus>(
streams, gpu_indexes, gpu_count, params, num_blocks_in_radix,
max_num_radix_in_vec, allocate_gpu_memory);
}
template <typename Torus, class params>
__host__ void host_integer_partial_sum_ciphertexts_vec_kb(
__host__ void host_integer_sum_ciphertexts_vec_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *radix_lwe_out, Torus *terms, int *terms_degree, void **bsks,
uint64_t **ksks, int_sum_ciphertexts_vec_memory<uint64_t> *mem_ptr,
uint32_t num_blocks_in_radix, uint32_t num_radix_in_vec,
int_radix_lut<Torus> *reused_lut = nullptr) {
uint32_t num_blocks_in_radix, uint32_t num_radix_in_vec) {
auto new_blocks = mem_ptr->new_blocks;
auto old_blocks = mem_ptr->old_blocks;
@@ -198,12 +223,11 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
auto message_modulus = mem_ptr->params.message_modulus;
auto carry_modulus = mem_ptr->params.carry_modulus;
auto num_blocks = num_blocks_in_radix;
auto big_lwe_dimension = mem_ptr->params.big_lwe_dimension;
auto big_lwe_size = big_lwe_dimension + 1;
auto big_lwe_size = mem_ptr->params.big_lwe_dimension + 1;
auto glwe_dimension = mem_ptr->params.glwe_dimension;
auto polynomial_size = mem_ptr->params.polynomial_size;
auto small_lwe_dimension = mem_ptr->params.small_lwe_dimension;
auto small_lwe_size = small_lwe_dimension + 1;
auto lwe_dimension = mem_ptr->params.small_lwe_dimension;
auto big_lwe_dimension = mem_ptr->params.big_lwe_dimension;
if (old_blocks != terms) {
cuda_memcpy_async_gpu_to_gpu(old_blocks, terms,
@@ -222,48 +246,7 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
int32_t h_smart_copy_in[r * num_blocks];
int32_t h_smart_copy_out[r * num_blocks];
/// Here it is important to query the default max shared memory on device 0
/// instead of cuda_get_max_shared_memory,
/// to avoid bugs with tree_add_chunks trying to use too much shared memory
int max_shared_memory = 0;
check_cuda_error(cudaDeviceGetAttribute(
&max_shared_memory, cudaDevAttrMaxSharedMemoryPerBlock, 0));
// create lut object for message and carry
// we allocate luts_message_carry in the host function (instead of scratch)
// to reduce average memory consumption
int_radix_lut<Torus> *luts_message_carry;
size_t ch_amount = r / chunk_size;
if (!ch_amount)
ch_amount++;
if (reused_lut == nullptr) {
luts_message_carry = new int_radix_lut<Torus>(
streams, gpu_indexes, gpu_count, mem_ptr->params, 2,
2 * ch_amount * num_blocks, true);
} else {
luts_message_carry = new int_radix_lut<Torus>(
streams, gpu_indexes, gpu_count, mem_ptr->params, 2,
2 * ch_amount * num_blocks, reused_lut);
}
auto message_acc = luts_message_carry->get_lut(gpu_indexes[0], 0);
auto carry_acc = luts_message_carry->get_lut(gpu_indexes[0], 1);
// define functions for each accumulator
auto lut_f_message = [message_modulus](Torus x) -> Torus {
return x % message_modulus;
};
auto lut_f_carry = [message_modulus](Torus x) -> Torus {
return x / message_modulus;
};
// generate accumulators
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], message_acc, glwe_dimension, polynomial_size,
message_modulus, carry_modulus, lut_f_message);
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], carry_acc, glwe_dimension, polynomial_size,
message_modulus, carry_modulus, lut_f_carry);
luts_message_carry->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
auto max_shared_memory = cuda_get_max_shared_memory(gpu_indexes[0]);
while (r > 2) {
size_t cur_total_blocks = r * num_blocks;
@@ -274,8 +257,12 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
size_t sm_size = big_lwe_size * sizeof(Torus);
cudaSetDevice(gpu_indexes[0]);
tree_add_chunks<Torus><<<add_grid, 512, 0, streams[0]>>>(
new_blocks, old_blocks, min(r, chunk_size), big_lwe_size, num_blocks);
if (sm_size < max_shared_memory)
tree_add_chunks<Torus, FULLSM><<<add_grid, 512, sm_size, streams[0]>>>(
new_blocks, old_blocks, min(r, chunk_size), big_lwe_size, num_blocks);
else
tree_add_chunks<Torus, NOSM><<<add_grid, 512, 0, streams[0]>>>(
new_blocks, old_blocks, min(r, chunk_size), big_lwe_size, num_blocks);
check_cuda_error(cudaGetLastError());
@@ -288,21 +275,46 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
terms_degree, h_lwe_idx_in, h_lwe_idx_out, h_smart_copy_in,
h_smart_copy_out, ch_amount, r, num_blocks, chunk_size, message_max,
total_count, message_count, carry_count, sm_copy_count);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
// create lut object for message and carry
// we allocate luts_message_carry in the host function (instead of scratch)
// to reduce average memory consumption
auto luts_message_carry = new int_radix_lut<Torus>(
streams, gpu_indexes, gpu_count, mem_ptr->params, 2, total_count, true);
auto message_acc = luts_message_carry->get_lut(gpu_indexes[0], 0);
auto carry_acc = luts_message_carry->get_lut(gpu_indexes[0], 1);
// define functions for each accumulator
auto lut_f_message = [message_modulus](Torus x) -> Torus {
return x % message_modulus;
};
auto lut_f_carry = [message_modulus](Torus x) -> Torus {
return x / message_modulus;
};
// generate accumulators
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], message_acc, glwe_dimension,
polynomial_size, message_modulus, carry_modulus, lut_f_message);
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], carry_acc, glwe_dimension, polynomial_size,
message_modulus, carry_modulus, lut_f_carry);
auto lwe_indexes_in = luts_message_carry->lwe_indexes_in;
auto lwe_indexes_out = luts_message_carry->lwe_indexes_out;
luts_message_carry->set_lwe_indexes(streams[0], gpu_indexes[0],
h_lwe_idx_in, h_lwe_idx_out);
size_t copy_size = sm_copy_count * sizeof(int32_t);
size_t copy_size = total_count * sizeof(Torus);
cuda_memcpy_async_to_gpu(lwe_indexes_in, h_lwe_idx_in, copy_size,
streams[0], gpu_indexes[0]);
cuda_memcpy_async_to_gpu(lwe_indexes_out, h_lwe_idx_out, copy_size,
streams[0], gpu_indexes[0]);
copy_size = sm_copy_count * sizeof(int32_t);
cuda_memcpy_async_to_gpu(d_smart_copy_in, h_smart_copy_in, copy_size,
streams[0], gpu_indexes[0]);
cuda_memcpy_async_to_gpu(d_smart_copy_out, h_smart_copy_out, copy_size,
streams[0], gpu_indexes[0]);
// inside d_smart_copy_in there are only -1 values
// it's fine to call smart_copy with same pointer
// as source and destination
smart_copy<<<sm_copy_count, 1024, 0, streams[0]>>>(
new_blocks, new_blocks, d_smart_copy_out, d_smart_copy_in,
big_lwe_size);
@@ -316,97 +328,28 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
luts_message_carry->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
/// For multi GPU execution we create vectors of pointers for inputs and
/// outputs
std::vector<Torus *> new_blocks_vec = luts_message_carry->lwe_array_in_vec;
std::vector<Torus *> small_lwe_vector_vec =
luts_message_carry->lwe_after_ks_vec;
std::vector<Torus *> lwe_after_pbs_vec =
luts_message_carry->lwe_after_pbs_vec;
std::vector<Torus *> lwe_trivial_indexes_vec =
luts_message_carry->lwe_trivial_indexes_vec;
auto active_gpu_count = get_active_gpu_count(total_count, gpu_count);
if (active_gpu_count == 1) {
/// Apply KS to go from a big LWE dimension to a small LWE dimension
/// After this keyswitch execution, we need to synchronize the streams
/// because the keyswitch and PBS do not operate on the same number of
/// inputs
execute_keyswitch_async<Torus>(
streams, gpu_indexes, 1, small_lwe_vector, lwe_indexes_in, new_blocks,
lwe_indexes_in, ksks, polynomial_size * glwe_dimension,
small_lwe_dimension, mem_ptr->params.ks_base_log,
mem_ptr->params.ks_level, message_count);
/// Apply KS to go from a big LWE dimension to a small LWE dimension
/// After this keyswitch execution, we need to synchronize the streams
/// because the keyswitch and PBS do not operate on the same number of
/// inputs
execute_keyswitch<Torus>(streams, gpu_indexes, gpu_count, small_lwe_vector,
lwe_indexes_in, new_blocks, lwe_indexes_in, ksks,
polynomial_size * glwe_dimension, lwe_dimension,
mem_ptr->params.ks_base_log,
mem_ptr->params.ks_level, message_count, true);
/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
execute_pbs_async<Torus>(
streams, gpu_indexes, 1, new_blocks, lwe_indexes_out,
luts_message_carry->lut_vec, luts_message_carry->lut_indexes_vec,
small_lwe_vector, lwe_indexes_in, bsks, luts_message_carry->buffer,
glwe_dimension, small_lwe_dimension, polynomial_size,
mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level,
mem_ptr->params.grouping_factor, total_count,
mem_ptr->params.pbs_type);
} else {
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
multi_gpu_scatter_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, new_blocks_vec, new_blocks,
luts_message_carry->h_lwe_indexes_in,
luts_message_carry->using_trivial_lwe_indexes, message_count,
big_lwe_size);
/// Apply KS to go from a big LWE dimension to a small LWE dimension
/// After this keyswitch execution, we need to synchronize the streams
/// because the keyswitch and PBS do not operate on the same number of
/// inputs
execute_keyswitch_async<Torus>(
streams, gpu_indexes, active_gpu_count, small_lwe_vector_vec,
lwe_trivial_indexes_vec, new_blocks_vec, lwe_trivial_indexes_vec,
ksks, big_lwe_dimension, small_lwe_dimension,
mem_ptr->params.ks_base_log, mem_ptr->params.ks_level, total_count);
/// Copy data back to GPU 0, rebuild the lwe array, and scatter again on a
/// different configuration
multi_gpu_gather_lwe_async<Torus>(
streams, gpu_indexes, gpu_count, small_lwe_vector,
small_lwe_vector_vec, luts_message_carry->h_lwe_indexes_in,
luts_message_carry->using_trivial_lwe_indexes, message_count,
small_lwe_size);
/// Synchronize all GPUs
for (uint i = 0; i < active_gpu_count; i++) {
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
}
multi_gpu_scatter_lwe_async<Torus>(
streams, gpu_indexes, gpu_count, small_lwe_vector_vec,
small_lwe_vector, luts_message_carry->h_lwe_indexes_in,
luts_message_carry->using_trivial_lwe_indexes, total_count,
small_lwe_size);
/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
execute_pbs_async<Torus>(
streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec,
lwe_trivial_indexes_vec, luts_message_carry->lut_vec,
luts_message_carry->lut_indexes_vec, small_lwe_vector_vec,
lwe_trivial_indexes_vec, bsks, luts_message_carry->buffer,
glwe_dimension, small_lwe_dimension, polynomial_size,
mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level,
mem_ptr->params.grouping_factor, total_count,
mem_ptr->params.pbs_type);
multi_gpu_gather_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, new_blocks, lwe_after_pbs_vec,
luts_message_carry->h_lwe_indexes_out,
luts_message_carry->using_trivial_lwe_indexes, total_count,
big_lwe_size);
/// Synchronize all GPUs
for (uint i = 0; i < active_gpu_count; i++) {
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
}
}
/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
execute_pbs<Torus>(streams, gpu_indexes, gpu_count, new_blocks,
lwe_indexes_out, luts_message_carry->lut_vec,
luts_message_carry->lut_indexes_vec, small_lwe_vector,
lwe_indexes_in, bsks, luts_message_carry->buffer,
glwe_dimension, lwe_dimension, polynomial_size,
mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level,
mem_ptr->params.grouping_factor, total_count, 2, 0,
max_shared_memory, mem_ptr->params.pbs_type, true);
luts_message_carry->release(streams, gpu_indexes, gpu_count);
int rem_blocks = (r > chunk_size) ? r % chunk_size * num_blocks : 0;
int new_blocks_created = 2 * ch_amount * num_blocks;
@@ -419,15 +362,17 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
std::swap(new_blocks, old_blocks);
r = (new_blocks_created + rem_blocks) / num_blocks;
}
luts_message_carry->release(streams, gpu_indexes, gpu_count);
delete (luts_message_carry);
host_addition(streams[0], gpu_indexes[0], radix_lwe_out, old_blocks,
&old_blocks[num_blocks * big_lwe_size], big_lwe_dimension,
num_blocks);
host_propagate_single_carry<Torus>(streams, gpu_indexes, gpu_count,
radix_lwe_out, nullptr, mem_ptr->scp_mem,
bsks, ksks, num_blocks);
}
template <typename Torus, class params>
template <typename Torus, typename STorus, class params>
__host__ void host_integer_mult_radix_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
uint64_t *radix_lwe_out, uint64_t *radix_lwe_left,
@@ -519,7 +464,8 @@ __host__ void host_integer_mult_radix_kb(
fill_radix_from_lsb_msb<Torus, params>
<<<num_blocks * num_blocks, params::degree / params::opt, 0,
streams[0]>>>(vector_result_sb, vector_result_lsb, vector_result_msb,
glwe_dimension, num_blocks);
glwe_dimension, lsb_vector_block_count,
msb_vector_block_count, num_blocks);
check_cuda_error(cudaGetLastError());
int terms_degree[2 * num_blocks * num_blocks];
@@ -535,15 +481,10 @@ __host__ void host_integer_mult_radix_kb(
terms_degree_msb[i] = (b_id > r_id) ? message_modulus - 2 : 0;
}
host_integer_partial_sum_ciphertexts_vec_kb<Torus, params>(
host_integer_sum_ciphertexts_vec_kb<Torus, params>(
streams, gpu_indexes, gpu_count, radix_lwe_out, vector_result_sb,
terms_degree, bsks, ksks, mem_ptr->sum_ciphertexts_mem, num_blocks,
2 * num_blocks, mem_ptr->luts_array);
auto scp_mem_ptr = mem_ptr->sum_ciphertexts_mem->scp_mem;
host_propagate_single_carry<Torus>(streams, gpu_indexes, gpu_count,
radix_lwe_out, nullptr, nullptr,
scp_mem_ptr, bsks, ksks, num_blocks);
2 * num_blocks);
}
template <typename Torus>
@@ -551,6 +492,22 @@ __host__ void scratch_cuda_integer_mult_radix_ciphertext_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
int_mul_memory<Torus> **mem_ptr, uint32_t num_radix_blocks,
int_radix_params params, bool allocate_gpu_memory) {
size_t sm_size = (params.big_lwe_dimension + 1) * sizeof(Torus);
if (sm_size < cuda_get_max_shared_memory(gpu_indexes[0])) {
check_cuda_error(cudaFuncSetAttribute(
tree_add_chunks<Torus, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, sm_size));
cudaFuncSetCacheConfig(tree_add_chunks<Torus, FULLSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else {
check_cuda_error(
cudaFuncSetAttribute(tree_add_chunks<Torus, NOSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
cudaFuncSetCacheConfig(tree_add_chunks<Torus, NOSM>, cudaFuncCachePreferL1);
check_cuda_error(cudaGetLastError());
}
*mem_ptr = new int_mul_memory<Torus>(streams, gpu_indexes, gpu_count, params,
num_radix_blocks, allocate_gpu_memory);
}

View File

@@ -38,13 +38,65 @@ void cuda_integer_radix_overflowing_sub_kb_64(
auto mem = (int_overflowing_sub_memory<uint64_t> *)mem_ptr;
host_integer_overflowing_sub_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_overflowed),
static_cast<uint64_t *>(radix_lwe_left),
static_cast<uint64_t *>(radix_lwe_right), bsks, (uint64_t **)(ksks), mem,
num_blocks);
switch (mem->params.polynomial_size) {
case 512:
host_integer_overflowing_sub_kb<uint64_t, AmortizedDegree<512>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_overflowed),
static_cast<uint64_t *>(radix_lwe_left),
static_cast<uint64_t *>(radix_lwe_right), bsks, (uint64_t **)(ksks),
mem, num_blocks);
break;
case 1024:
host_integer_overflowing_sub_kb<uint64_t, AmortizedDegree<1024>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_overflowed),
static_cast<uint64_t *>(radix_lwe_left),
static_cast<uint64_t *>(radix_lwe_right), bsks, (uint64_t **)(ksks),
mem, num_blocks);
break;
case 2048:
host_integer_overflowing_sub_kb<uint64_t, AmortizedDegree<2048>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_overflowed),
static_cast<uint64_t *>(radix_lwe_left),
static_cast<uint64_t *>(radix_lwe_right), bsks, (uint64_t **)(ksks),
mem, num_blocks);
break;
case 4096:
host_integer_overflowing_sub_kb<uint64_t, AmortizedDegree<4096>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_overflowed),
static_cast<uint64_t *>(radix_lwe_left),
static_cast<uint64_t *>(radix_lwe_right), bsks, (uint64_t **)(ksks),
mem, num_blocks);
break;
case 8192:
host_integer_overflowing_sub_kb<uint64_t, AmortizedDegree<8192>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_overflowed),
static_cast<uint64_t *>(radix_lwe_left),
static_cast<uint64_t *>(radix_lwe_right), bsks, (uint64_t **)(ksks),
mem, num_blocks);
break;
case 16384:
host_integer_overflowing_sub_kb<uint64_t, AmortizedDegree<16384>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_overflowed),
static_cast<uint64_t *>(radix_lwe_left),
static_cast<uint64_t *>(radix_lwe_right), bsks, (uint64_t **)(ksks),
mem, num_blocks);
break;
default:
PANIC("Cuda error (integer overflowing sub): unsupported polynomial size. "
"Only N = 512, 1024, 2048, 4096, 8192, 16384 is supported")
}
}
void cleanup_cuda_integer_radix_overflowing_sub(void **streams,

View File

@@ -98,7 +98,7 @@ __host__ void scratch_cuda_integer_overflowing_sub_kb(
streams, gpu_indexes, gpu_count, params, num_blocks, allocate_gpu_memory);
}
template <typename Torus>
template <typename Torus, class params>
__host__ void host_integer_overflowing_sub_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *radix_lwe_out, Torus *radix_lwe_overflowed, Torus *radix_lwe_left,
@@ -113,9 +113,9 @@ __host__ void host_integer_overflowing_sub_kb(
radix_params.message_modulus, radix_params.carry_modulus,
radix_params.message_modulus - 1);
host_propagate_single_sub_borrow<Torus>(streams, gpu_indexes, gpu_count,
radix_lwe_overflowed, radix_lwe_out,
mem_ptr, bsks, ksks, num_blocks);
host_propagate_single_sub_borrow<Torus>(
streams, gpu_indexes, gpu_count, radix_lwe_overflowed, radix_lwe_out,
mem_ptr->borrow_prop_mem, bsks, ksks, num_blocks);
}
#endif

View File

@@ -2,6 +2,7 @@
#define CUDA_INTEGER_SCALAR_COMPARISON_OPS_CUH
#include "integer/comparison.cuh"
#include <omp.h>
template <typename Torus>
__host__ void integer_radix_unsigned_scalar_difference_check_kb(
@@ -86,43 +87,53 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
}
//////////////
// lsb
Torus *lhs = diff_buffer->tmp_packed_left;
Torus *rhs = diff_buffer->tmp_packed_right;
#pragma omp parallel sections
{
// Both sections may be executed in parallel
#pragma omp section
{
//////////////
// lsb
Torus *lhs = diff_buffer->tmp_packed_left;
Torus *rhs = diff_buffer->tmp_packed_right;
pack_blocks(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in,
big_lwe_dimension, num_lsb_radix_blocks, message_modulus);
pack_blocks(lsb_streams[0], gpu_indexes[0], rhs, scalar_blocks, 0,
total_num_scalar_blocks, message_modulus);
pack_blocks(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in,
big_lwe_dimension, num_lsb_radix_blocks, message_modulus);
pack_blocks(lsb_streams[0], gpu_indexes[0], rhs, scalar_blocks, 0,
total_num_scalar_blocks, message_modulus);
// From this point we have half number of blocks
num_lsb_radix_blocks /= 2;
num_lsb_radix_blocks += (total_num_scalar_blocks % 2);
// From this point we have half number of blocks
num_lsb_radix_blocks /= 2;
num_lsb_radix_blocks += (total_num_scalar_blocks % 2);
// comparisons will be assigned
// - 0 if lhs < rhs
// - 1 if lhs == rhs
// - 2 if lhs > rhs
// comparisons will be assigned
// - 0 if lhs < rhs
// - 1 if lhs == rhs
// - 2 if lhs > rhs
auto comparisons = mem_ptr->tmp_block_comparisons;
scalar_compare_radix_blocks_kb(lsb_streams, gpu_indexes, gpu_count,
comparisons, lhs, rhs, mem_ptr, bsks, ksks,
num_lsb_radix_blocks);
auto comparisons = mem_ptr->tmp_block_comparisons;
scalar_compare_radix_blocks_kb(lsb_streams, gpu_indexes, gpu_count,
comparisons, lhs, rhs, mem_ptr, bsks,
ksks, num_lsb_radix_blocks);
// Reduces a vec containing radix blocks that encrypts a sign
// (inferior, equal, superior) to one single radix block containing the
// final sign
tree_sign_reduction(lsb_streams, gpu_indexes, gpu_count, lwe_array_lsb_out,
comparisons, mem_ptr->diff_buffer->tree_buffer,
mem_ptr->identity_lut_f, bsks, ksks,
num_lsb_radix_blocks);
//////////////
// msb
host_compare_with_zero_equality(msb_streams, gpu_indexes, gpu_count,
lwe_array_msb_out, msb, mem_ptr, bsks, ksks,
num_msb_radix_blocks, mem_ptr->is_zero_lut);
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
// Reduces a vec containing radix blocks that encrypts a sign
// (inferior, equal, superior) to one single radix block containing the
// final sign
tree_sign_reduction(
lsb_streams, gpu_indexes, gpu_count, lwe_array_lsb_out, comparisons,
mem_ptr->diff_buffer->tree_buffer, mem_ptr->identity_lut_f, bsks,
ksks, num_lsb_radix_blocks);
}
#pragma omp section
{
//////////////
// msb
host_compare_with_zero_equality(
msb_streams, gpu_indexes, gpu_count, lwe_array_msb_out, msb,
mem_ptr, bsks, ksks, num_msb_radix_blocks, mem_ptr->is_zero_lut);
}
}
for (uint j = 0; j < gpu_count; j++) {
cuda_synchronize_stream(lsb_streams[j], gpu_indexes[j]);
cuda_synchronize_stream(msb_streams[j], gpu_indexes[j]);
}
@@ -194,6 +205,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
std::function<Torus(Torus)> sign_handler_f, void **bsks, Torus **ksks,
uint32_t total_num_radix_blocks, uint32_t total_num_scalar_blocks) {
cudaSetDevice(gpu_indexes[0]);
auto params = mem_ptr->params;
auto big_lwe_dimension = params.big_lwe_dimension;
auto glwe_dimension = params.glwe_dimension;
@@ -299,83 +311,93 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
}
//////////////
// lsb
Torus *lhs = diff_buffer->tmp_packed_left;
Torus *rhs = diff_buffer->tmp_packed_right;
#pragma omp parallel sections
{
// Both sections may be executed in parallel
#pragma omp section
{
//////////////
// lsb
Torus *lhs = diff_buffer->tmp_packed_left;
Torus *rhs = diff_buffer->tmp_packed_right;
pack_blocks(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in,
big_lwe_dimension, num_lsb_radix_blocks, message_modulus);
pack_blocks(lsb_streams[0], gpu_indexes[0], rhs, scalar_blocks, 0,
total_num_scalar_blocks, message_modulus);
pack_blocks(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in,
big_lwe_dimension, num_lsb_radix_blocks, message_modulus);
pack_blocks(lsb_streams[0], gpu_indexes[0], rhs, scalar_blocks, 0,
total_num_scalar_blocks, message_modulus);
// From this point we have half number of blocks
num_lsb_radix_blocks /= 2;
num_lsb_radix_blocks += (total_num_scalar_blocks % 2);
// From this point we have half number of blocks
num_lsb_radix_blocks /= 2;
num_lsb_radix_blocks += (total_num_scalar_blocks % 2);
// comparisons will be assigned
// - 0 if lhs < rhs
// - 1 if lhs == rhs
// - 2 if lhs > rhs
// comparisons will be assigned
// - 0 if lhs < rhs
// - 1 if lhs == rhs
// - 2 if lhs > rhs
auto comparisons = mem_ptr->tmp_block_comparisons;
scalar_compare_radix_blocks_kb(lsb_streams, gpu_indexes, gpu_count,
comparisons, lhs, rhs, mem_ptr, bsks, ksks,
num_lsb_radix_blocks);
auto comparisons = mem_ptr->tmp_block_comparisons;
scalar_compare_radix_blocks_kb(lsb_streams, gpu_indexes, gpu_count,
comparisons, lhs, rhs, mem_ptr, bsks,
ksks, num_lsb_radix_blocks);
// Reduces a vec containing radix blocks that encrypts a sign
// (inferior, equal, superior) to one single radix block containing the
// final sign
tree_sign_reduction(lsb_streams, gpu_indexes, gpu_count, lwe_array_lsb_out,
comparisons, mem_ptr->diff_buffer->tree_buffer,
mem_ptr->identity_lut_f, bsks, ksks,
num_lsb_radix_blocks);
//////////////
// msb
// We remove the last block (which is the sign)
Torus *are_all_msb_zeros = lwe_array_msb_out;
host_compare_with_zero_equality(msb_streams, gpu_indexes, gpu_count,
are_all_msb_zeros, msb, mem_ptr, bsks, ksks,
num_msb_radix_blocks, mem_ptr->is_zero_lut);
auto sign_bit_pos = (int)log2(message_modulus) - 1;
auto lut_f = [mem_ptr, sign_bit_pos](Torus sign_block,
Torus msb_are_zeros) {
bool sign_bit_is_set = (sign_block >> sign_bit_pos) == 1;
CMP_ORDERING sign_block_ordering;
if (sign_bit_is_set) {
sign_block_ordering = CMP_ORDERING::IS_INFERIOR;
} else if (sign_block != 0) {
sign_block_ordering = CMP_ORDERING::IS_SUPERIOR;
} else {
sign_block_ordering = CMP_ORDERING::IS_EQUAL;
// Reduces a vec containing radix blocks that encrypts a sign
// (inferior, equal, superior) to one single radix block containing the
// final sign
tree_sign_reduction(
lsb_streams, gpu_indexes, gpu_count, lwe_array_lsb_out, comparisons,
mem_ptr->diff_buffer->tree_buffer, mem_ptr->identity_lut_f, bsks,
ksks, num_lsb_radix_blocks);
}
#pragma omp section
{
//////////////
// msb
// We remove the last block (which is the sign)
Torus *are_all_msb_zeros = lwe_array_msb_out;
host_compare_with_zero_equality(
msb_streams, gpu_indexes, gpu_count, are_all_msb_zeros, msb,
mem_ptr, bsks, ksks, num_msb_radix_blocks, mem_ptr->is_zero_lut);
CMP_ORDERING msb_ordering;
if (msb_are_zeros == 1)
msb_ordering = CMP_ORDERING::IS_EQUAL;
else
msb_ordering = CMP_ORDERING::IS_SUPERIOR;
auto sign_bit_pos = (int)log2(message_modulus) - 1;
return mem_ptr->diff_buffer->tree_buffer->block_selector_f(
sign_block_ordering, msb_ordering);
};
auto lut_f = [mem_ptr, sign_bit_pos](Torus sign_block,
Torus msb_are_zeros) {
bool sign_bit_is_set = (sign_block >> sign_bit_pos) == 1;
CMP_ORDERING sign_block_ordering;
if (sign_bit_is_set) {
sign_block_ordering = CMP_ORDERING::IS_INFERIOR;
} else if (sign_block != 0) {
sign_block_ordering = CMP_ORDERING::IS_SUPERIOR;
} else {
sign_block_ordering = CMP_ORDERING::IS_EQUAL;
}
auto signed_msb_lut = mem_ptr->signed_msb_lut;
generate_device_accumulator_bivariate<Torus>(
msb_streams[0], gpu_indexes[0],
signed_msb_lut->get_lut(gpu_indexes[0], 0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
lut_f);
signed_msb_lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
CMP_ORDERING msb_ordering;
if (msb_are_zeros == 1)
msb_ordering = CMP_ORDERING::IS_EQUAL;
else
msb_ordering = CMP_ORDERING::IS_SUPERIOR;
Torus *sign_block = msb + (num_msb_radix_blocks - 1) * big_lwe_size;
integer_radix_apply_bivariate_lookup_table_kb(
msb_streams, gpu_indexes, gpu_count, lwe_array_msb_out, sign_block,
are_all_msb_zeros, bsks, ksks, 1, signed_msb_lut,
signed_msb_lut->params.message_modulus);
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
return mem_ptr->diff_buffer->tree_buffer->block_selector_f(
sign_block_ordering, msb_ordering);
};
auto signed_msb_lut = mem_ptr->signed_msb_lut;
generate_device_accumulator_bivariate<Torus>(
msb_streams[0], gpu_indexes[0],
signed_msb_lut->get_lut(gpu_indexes[0], 0), params.glwe_dimension,
params.polynomial_size, params.message_modulus,
params.carry_modulus, lut_f);
signed_msb_lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
Torus *sign_block = msb + (num_msb_radix_blocks - 1) * big_lwe_size;
integer_radix_apply_bivariate_lookup_table_kb(
msb_streams, gpu_indexes, gpu_count, lwe_array_msb_out, sign_block,
are_all_msb_zeros, bsks, ksks, 1, signed_msb_lut,
signed_msb_lut->params.message_modulus);
}
}
for (uint j = 0; j < gpu_count; j++) {
cuda_synchronize_stream(lsb_streams[j], gpu_indexes[j]);
cuda_synchronize_stream(msb_streams[j], gpu_indexes[j]);
}
@@ -400,38 +422,50 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
auto lwe_array_ct_out = mem_ptr->tmp_lwe_array_out;
auto lwe_array_sign_out =
lwe_array_ct_out + (num_lsb_radix_blocks / 2) * big_lwe_size;
Torus *lhs = diff_buffer->tmp_packed_left;
Torus *rhs = diff_buffer->tmp_packed_right;
#pragma omp parallel sections
{
// Both sections may be executed in parallel
#pragma omp section
{
Torus *lhs = diff_buffer->tmp_packed_left;
Torus *rhs = diff_buffer->tmp_packed_right;
pack_blocks(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in,
big_lwe_dimension, num_lsb_radix_blocks - 1, message_modulus);
pack_blocks(lsb_streams[0], gpu_indexes[0], rhs, scalar_blocks, 0,
num_lsb_radix_blocks - 1, message_modulus);
pack_blocks(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in,
big_lwe_dimension, num_lsb_radix_blocks - 1,
message_modulus);
pack_blocks(lsb_streams[0], gpu_indexes[0], rhs, scalar_blocks, 0,
num_lsb_radix_blocks - 1, message_modulus);
// From this point we have half number of blocks
num_lsb_radix_blocks /= 2;
// From this point we have half number of blocks
num_lsb_radix_blocks /= 2;
// comparisons will be assigned
// - 0 if lhs < rhs
// - 1 if lhs == rhs
// - 2 if lhs > rhs
scalar_compare_radix_blocks_kb(lsb_streams, gpu_indexes, gpu_count,
lwe_array_ct_out, lhs, rhs, mem_ptr, bsks,
ksks, num_lsb_radix_blocks);
Torus *encrypted_sign_block =
lwe_array_in + (total_num_radix_blocks - 1) * big_lwe_size;
Torus *scalar_sign_block = scalar_blocks + (total_num_scalar_blocks - 1);
// comparisons will be assigned
// - 0 if lhs < rhs
// - 1 if lhs == rhs
// - 2 if lhs > rhs
scalar_compare_radix_blocks_kb(lsb_streams, gpu_indexes, gpu_count,
lwe_array_ct_out, lhs, rhs, mem_ptr,
bsks, ksks, num_lsb_radix_blocks);
}
#pragma omp section
{
Torus *encrypted_sign_block =
lwe_array_in + (total_num_radix_blocks - 1) * big_lwe_size;
Torus *scalar_sign_block =
scalar_blocks + (total_num_scalar_blocks - 1);
auto trivial_sign_block = mem_ptr->tmp_trivial_sign_block;
create_trivial_radix(msb_streams[0], gpu_indexes[0], trivial_sign_block,
scalar_sign_block, big_lwe_dimension, 1, 1,
message_modulus, carry_modulus);
auto trivial_sign_block = mem_ptr->tmp_trivial_sign_block;
create_trivial_radix(msb_streams[0], gpu_indexes[0], trivial_sign_block,
scalar_sign_block, big_lwe_dimension, 1, 1,
message_modulus, carry_modulus);
integer_radix_apply_bivariate_lookup_table_kb(
msb_streams, gpu_indexes, gpu_count, lwe_array_sign_out,
encrypted_sign_block, trivial_sign_block, bsks, ksks, 1,
mem_ptr->signed_lut, mem_ptr->signed_lut->params.message_modulus);
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
integer_radix_apply_bivariate_lookup_table_kb(
msb_streams, gpu_indexes, gpu_count, lwe_array_sign_out,
encrypted_sign_block, trivial_sign_block, bsks, ksks, 1,
mem_ptr->signed_lut, mem_ptr->signed_lut->params.message_modulus);
}
}
for (uint j = 0; j < gpu_count; j++) {
cuda_synchronize_stream(lsb_streams[j], gpu_indexes[j]);
cuda_synchronize_stream(msb_streams[j], gpu_indexes[j]);
}
@@ -532,8 +566,6 @@ __host__ void scalar_compare_radix_blocks_kb(
int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
uint32_t num_radix_blocks) {
if (num_radix_blocks == 0)
return;
auto params = mem_ptr->params;
auto big_lwe_dimension = params.big_lwe_dimension;
auto message_modulus = params.message_modulus;
@@ -654,47 +686,58 @@ __host__ void host_integer_radix_scalar_equality_check_kb(
auto lsb_streams = mem_ptr->lsb_streams;
auto msb_streams = mem_ptr->msb_streams;
if (num_halved_scalar_blocks > 0) {
auto packed_blocks = mem_ptr->tmp_packed_input;
auto packed_scalar =
packed_blocks + big_lwe_size * num_halved_lsb_radix_blocks;
#pragma omp parallel sections
{
// Both sections may be executed in parallel
#pragma omp section
{
if (num_halved_scalar_blocks > 0) {
auto packed_blocks = mem_ptr->tmp_packed_input;
auto packed_scalar =
packed_blocks + big_lwe_size * num_halved_lsb_radix_blocks;
pack_blocks(lsb_streams[0], gpu_indexes[0], packed_blocks, lsb,
big_lwe_dimension, num_lsb_radix_blocks, message_modulus);
pack_blocks(lsb_streams[0], gpu_indexes[0], packed_scalar, scalar_blocks, 0,
num_scalar_blocks, message_modulus);
pack_blocks(lsb_streams[0], gpu_indexes[0], packed_blocks, lsb,
big_lwe_dimension, num_lsb_radix_blocks, message_modulus);
pack_blocks(lsb_streams[0], gpu_indexes[0], packed_scalar,
scalar_blocks, 0, num_scalar_blocks, message_modulus);
cuda_memcpy_async_gpu_to_gpu(
scalar_comparison_luts->get_lut_indexes(gpu_indexes[0], 0),
packed_scalar, num_halved_scalar_blocks * sizeof(Torus), lsb_streams[0],
gpu_indexes[0]);
scalar_comparison_luts->broadcast_lut(lsb_streams, gpu_indexes, 0);
cuda_memcpy_async_gpu_to_gpu(
scalar_comparison_luts->get_lut_indexes(gpu_indexes[0], 0),
packed_scalar, num_halved_scalar_blocks * sizeof(Torus),
lsb_streams[0], gpu_indexes[0]);
scalar_comparison_luts->broadcast_lut(lsb_streams, gpu_indexes, 0);
integer_radix_apply_univariate_lookup_table_kb(
lsb_streams, gpu_indexes, gpu_count, lwe_array_lsb_out, packed_blocks,
bsks, ksks, num_halved_lsb_radix_blocks, scalar_comparison_luts);
}
//////////////
// msb
if (num_msb_radix_blocks > 0) {
int_radix_lut<Torus> *msb_lut;
switch (mem_ptr->op) {
case COMPARISON_TYPE::EQ:
msb_lut = mem_ptr->is_zero_lut;
break;
case COMPARISON_TYPE::NE:
msb_lut = mem_ptr->eq_buffer->is_non_zero_lut;
break;
default:
PANIC("Cuda error: integer operation not supported")
integer_radix_apply_univariate_lookup_table_kb(
lsb_streams, gpu_indexes, gpu_count, lwe_array_lsb_out,
packed_blocks, bsks, ksks, num_halved_lsb_radix_blocks,
scalar_comparison_luts);
}
}
#pragma omp section
{
//////////////
// msb
if (num_msb_radix_blocks > 0) {
int_radix_lut<Torus> *msb_lut;
switch (mem_ptr->op) {
case COMPARISON_TYPE::EQ:
msb_lut = mem_ptr->is_zero_lut;
break;
case COMPARISON_TYPE::NE:
msb_lut = mem_ptr->eq_buffer->is_non_zero_lut;
break;
default:
PANIC("Cuda error: integer operation not supported")
}
host_compare_with_zero_equality(msb_streams, gpu_indexes, gpu_count,
lwe_array_msb_out, msb, mem_ptr, bsks, ksks,
num_msb_radix_blocks, msb_lut);
host_compare_with_zero_equality(msb_streams, gpu_indexes, gpu_count,
lwe_array_msb_out, msb, mem_ptr, bsks,
ksks, num_msb_radix_blocks, msb_lut);
}
}
}
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
for (uint j = 0; j < gpu_count; j++) {
cuda_synchronize_stream(lsb_streams[j], gpu_indexes[j]);
cuda_synchronize_stream(msb_streams[j], gpu_indexes[j]);
}

Some files were not shown because too many files have changed in this diff Show More