mirror of
https://github.com/zama-ai/tfhe-rs.git
synced 2026-04-28 03:01:21 -04:00
Compare commits
196 Commits
csprng-0.4
...
pa/refacto
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
5df67f7666 | ||
|
|
3397aa81d2 | ||
|
|
8f10f8f8db | ||
|
|
92be95c6b8 | ||
|
|
990c4d0380 | ||
|
|
1d5abfd5ea | ||
|
|
dfd1beeb47 | ||
|
|
43a007a2fa | ||
|
|
54faf64ecd | ||
|
|
8fe7f9c3cb | ||
|
|
9ed65db03d | ||
|
|
9413d3e722 | ||
|
|
2000feb87e | ||
|
|
594a5cee25 | ||
|
|
401cfc5fd0 | ||
|
|
769c725c67 | ||
|
|
07d143e032 | ||
|
|
d88bba761b | ||
|
|
eaa1d07f90 | ||
|
|
663322cfa5 | ||
|
|
ddd6a6e136 | ||
|
|
abc39f0a3e | ||
|
|
8b7556667b | ||
|
|
67b1607773 | ||
|
|
5340859003 | ||
|
|
a26e68c3bc | ||
|
|
0dd622ebb9 | ||
|
|
d69dd20079 | ||
|
|
80fe45f354 | ||
|
|
33114e3946 | ||
|
|
ede0745b7f | ||
|
|
bc4cd08e7a | ||
|
|
b03921f1ae | ||
|
|
70f7af06f5 | ||
|
|
a9bb6eac5f | ||
|
|
4fa9b243e0 | ||
|
|
b88f561358 | ||
|
|
0e71ca6c1c | ||
|
|
3ba61c0694 | ||
|
|
781f78c442 | ||
|
|
ebfc1ea8ac | ||
|
|
7fa9f33776 | ||
|
|
5547d92c79 | ||
|
|
351fc476b5 | ||
|
|
53cd3c8d0f | ||
|
|
0a2ad8ca72 | ||
|
|
eba4f6a89c | ||
|
|
4b933cf421 | ||
|
|
3303cd8568 | ||
|
|
f937524f64 | ||
|
|
e7da96271c | ||
|
|
0cc716544b | ||
|
|
f53087b5ed | ||
|
|
bcefe977c9 | ||
|
|
73ea24fd51 | ||
|
|
6f1a9bdaa5 | ||
|
|
7834f699d0 | ||
|
|
b81692b2df | ||
|
|
8748d1cc22 | ||
|
|
dbb13aa35e | ||
|
|
53f4c9bfc7 | ||
|
|
4021812248 | ||
|
|
190c5e7bb7 | ||
|
|
2004333d6e | ||
|
|
e7c06ef956 | ||
|
|
7b14fe6fee | ||
|
|
55f4df97b4 | ||
|
|
2144ec8107 | ||
|
|
fb862ddbbc | ||
|
|
ab0b01f7e1 | ||
|
|
6c4318b8bb | ||
|
|
d3f2ecd367 | ||
|
|
19dc0f02f9 | ||
|
|
95d50368fa | ||
|
|
c117798b10 | ||
|
|
da0934d4bc | ||
|
|
b522de3273 | ||
|
|
9205703454 | ||
|
|
a1b92a6db8 | ||
|
|
8d7c45bf17 | ||
|
|
91f05b00b9 | ||
|
|
ebb11b15c4 | ||
|
|
18270714d8 | ||
|
|
6c6525b1ea | ||
|
|
79f8971712 | ||
|
|
17db09bf2a | ||
|
|
fc9bfcaf61 | ||
|
|
d93c412dc5 | ||
|
|
ea222007d8 | ||
|
|
3470d6c2d8 | ||
|
|
fffdc3862e | ||
|
|
d9eca01631 | ||
|
|
95ef13f6ce | ||
|
|
230fa5a8f0 | ||
|
|
b443855b8b | ||
|
|
ba80c33328 | ||
|
|
e5dc45c084 | ||
|
|
b450f0eb30 | ||
|
|
7479cc826b | ||
|
|
b2beac2d2c | ||
|
|
b700416597 | ||
|
|
42609987a1 | ||
|
|
5b37a838ba | ||
|
|
c1fcd95d72 | ||
|
|
ffb8b4f930 | ||
|
|
3b8dace975 | ||
|
|
44f326824f | ||
|
|
f41d133fc7 | ||
|
|
52d43961b8 | ||
|
|
35b89704aa | ||
|
|
b578cf19c2 | ||
|
|
dd68ce67ad | ||
|
|
f8d8cc90fe | ||
|
|
eac37a7749 | ||
|
|
4342efecc8 | ||
|
|
a3ec84729d | ||
|
|
90d6b221d7 | ||
|
|
b1491734b2 | ||
|
|
436dd6a687 | ||
|
|
39534cb4c4 | ||
|
|
723443589d | ||
|
|
d58a1b68cb | ||
|
|
b29c477462 | ||
|
|
bed3d88426 | ||
|
|
35201b06b6 | ||
|
|
c8ddc0f008 | ||
|
|
4d934f512a | ||
|
|
52b0907c47 | ||
|
|
8ea647dc26 | ||
|
|
8f72677fa6 | ||
|
|
36a58cf16c | ||
|
|
de79f3a280 | ||
|
|
e9051419cd | ||
|
|
ac37c3883d | ||
|
|
72fb770308 | ||
|
|
34d07f5558 | ||
|
|
4176b3dcb5 | ||
|
|
abf9c3efb7 | ||
|
|
ebf1fd9e84 | ||
|
|
cef055b7f3 | ||
|
|
d65a4d8690 | ||
|
|
928bc13ed2 | ||
|
|
c81abae989 | ||
|
|
aff50fcb85 | ||
|
|
757606fdb4 | ||
|
|
7542c89679 | ||
|
|
dd74063959 | ||
|
|
f6845a988b | ||
|
|
6a3ff21de2 | ||
|
|
74cafd0e9d | ||
|
|
d8241942a6 | ||
|
|
46f0bf442a | ||
|
|
81c837c837 | ||
|
|
7b96f55900 | ||
|
|
f19e892053 | ||
|
|
2a989d64f9 | ||
|
|
eeb4accf66 | ||
|
|
0370bf6a3f | ||
|
|
a62c19b735 | ||
|
|
721a5a57ba | ||
|
|
3f101d5e8b | ||
|
|
e01f4abb65 | ||
|
|
a2ca189283 | ||
|
|
e0e9668b0b | ||
|
|
bd23d18c9d | ||
|
|
491112ffc1 | ||
|
|
83c3dadb5d | ||
|
|
7692643ca4 | ||
|
|
29cf2b83b8 | ||
|
|
1b47c74360 | ||
|
|
cd329729d7 | ||
|
|
8ec24d1bb7 | ||
|
|
13f61e4d67 | ||
|
|
72475a385e | ||
|
|
cc8f2cb4dc | ||
|
|
a153ea98ae | ||
|
|
60773497fe | ||
|
|
d632c916c2 | ||
|
|
6e4ea82db8 | ||
|
|
a7df399de3 | ||
|
|
90dc9a004e | ||
|
|
a4508f8396 | ||
|
|
c8e1998167 | ||
|
|
85d3ba6238 | ||
|
|
e9772953bf | ||
|
|
c407f3d5a6 | ||
|
|
5f0bff98dd | ||
|
|
634b7ada32 | ||
|
|
734edb3bdc | ||
|
|
ee181506c4 | ||
|
|
cf1576efbd | ||
|
|
d215359a75 | ||
|
|
1b5d5eeb94 | ||
|
|
bbaaa53656 | ||
|
|
88ad88e71c | ||
|
|
f338df0079 |
1
.github/actionlint.yaml
vendored
1
.github/actionlint.yaml
vendored
@@ -4,6 +4,7 @@ self-hosted-runner:
|
||||
- m1mac
|
||||
- 4090-desktop
|
||||
- large_windows_16_latest
|
||||
- large_ubuntu_16
|
||||
# Configuration variables in array of strings defined in your repository or
|
||||
# organization. `null` means disabling configuration variables check.
|
||||
# Empty array means no configuration variable is allowed.
|
||||
|
||||
120
.github/workflows/aws_tfhe_backward_compat_tests.yml
vendored
Normal file
120
.github/workflows/aws_tfhe_backward_compat_tests.yml
vendored
Normal file
@@ -0,0 +1,120 @@
|
||||
# Run backward compatibility tests
|
||||
name: Backward compatibility Tests on CPU
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUSTFLAGS: "-C target-cpu=native"
|
||||
RUST_BACKTRACE: "full"
|
||||
RUST_MIN_STACK: "8388608"
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
|
||||
on:
|
||||
# Allows you to run this workflow manually from the Actions tab as an alternative.
|
||||
workflow_dispatch:
|
||||
pull_request:
|
||||
|
||||
jobs:
|
||||
setup-instance:
|
||||
name: Setup instance (backward-compat-tests)
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
runner-name: ${{ steps.start-instance.outputs.label }}
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
backend: aws
|
||||
profile: cpu-small
|
||||
|
||||
backward-compat-tests:
|
||||
name: Backward compatibility tests
|
||||
needs: [ setup-instance ]
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}_${{ github.ref }}
|
||||
cancel-in-progress: true
|
||||
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
with:
|
||||
persist-credentials: 'false'
|
||||
|
||||
- name: Set up home
|
||||
run: |
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
- name: Install git-lfs
|
||||
run: |
|
||||
sudo apt update && sudo apt -y install git-lfs
|
||||
|
||||
- name: Use specific data branch
|
||||
if: ${{ contains(github.event.pull_request.labels.*.name, 'data_PR') }}
|
||||
env:
|
||||
PR_BRANCH: ${{ github.head_ref || github.ref_name }}
|
||||
run: |
|
||||
echo "BACKWARD_COMPAT_DATA_BRANCH=${PR_BRANCH}" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Get backward compat branch
|
||||
id: backward_compat_branch
|
||||
run: |
|
||||
BRANCH="$(make backward_compat_branch)"
|
||||
echo "branch=${BRANCH}" >> "${GITHUB_OUTPUT}"
|
||||
|
||||
- name: Clone test data
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
with:
|
||||
persist-credentials: 'false'
|
||||
repository: zama-ai/tfhe-backward-compat-data
|
||||
path: tfhe/tfhe-backward-compat-data
|
||||
lfs: 'true'
|
||||
ref: ${{ steps.backward_compat_branch.outputs.branch }}
|
||||
|
||||
- name: Run backward compatibility tests
|
||||
run: |
|
||||
make test_backward_compatibility_ci
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "Backward compatibility tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
|
||||
teardown-instance:
|
||||
name: Teardown instance (backward-compat-tests)
|
||||
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
|
||||
needs: [ setup-instance, backward-compat-tests ]
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
label: ${{ needs.setup-instance.outputs.runner-name }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "Instance teardown (backward-compat-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
144
.github/workflows/aws_tfhe_fast_tests.yml
vendored
144
.github/workflows/aws_tfhe_fast_tests.yml
vendored
@@ -1,4 +1,4 @@
|
||||
# Run a small subset of shortint and integer tests to ensure quick feedback.
|
||||
# Run a small subset of tests to ensure quick feedback.
|
||||
name: Fast AWS Tests on CPU
|
||||
|
||||
env:
|
||||
@@ -11,6 +11,7 @@ env:
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}
|
||||
|
||||
on:
|
||||
# Allows you to run this workflow manually from the Actions tab as an alternative.
|
||||
@@ -18,15 +19,112 @@ on:
|
||||
pull_request:
|
||||
|
||||
jobs:
|
||||
should-run:
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
pull-requests: write
|
||||
outputs:
|
||||
csprng_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.csprng_any_changed }}
|
||||
zk_pok_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.zk_pok_any_changed }}
|
||||
core_crypto_test: ${{ env.IS_PULL_REQUEST == 'false' ||
|
||||
steps.changed-files.outputs.core_crypto_any_changed ||
|
||||
steps.changed-files.outputs.dependencies_any_changed }}
|
||||
boolean_test: ${{ env.IS_PULL_REQUEST == 'false' ||
|
||||
steps.changed-files.outputs.boolean_any_changed ||
|
||||
steps.changed-files.outputs.dependencies_any_changed }}
|
||||
shortint_test: ${{ env.IS_PULL_REQUEST == 'false' ||
|
||||
steps.changed-files.outputs.shortint_any_changed ||
|
||||
steps.changed-files.outputs.dependencies_any_changed }}
|
||||
integer_test: ${{ env.IS_PULL_REQUEST == 'false' ||
|
||||
steps.changed-files.outputs.integer_any_changed ||
|
||||
steps.changed-files.outputs.dependencies_any_changed }}
|
||||
wasm_test: ${{ env.IS_PULL_REQUEST == 'false' ||
|
||||
steps.changed-files.outputs.wasm_any_changed ||
|
||||
steps.changed-files.outputs.dependencies_any_changed }}
|
||||
high_level_api_test: ${{ env.IS_PULL_REQUEST == 'false' ||
|
||||
steps.changed-files.outputs.high_level_api_any_changed ||
|
||||
steps.changed-files.outputs.dependencies_any_changed }}
|
||||
user_docs_test: ${{ env.IS_PULL_REQUEST == 'false' ||
|
||||
steps.changed-files.outputs.user_docs_any_changed ||
|
||||
steps.changed-files.outputs.dependencies_any_changed }}
|
||||
any_file_changed: ${{ env.IS_PULL_REQUEST == 'false' || steps.aggregated-changes.outputs.any_changed }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Check for file changes
|
||||
id: changed-files
|
||||
uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
|
||||
with:
|
||||
since_last_remote_commit: true
|
||||
files_yaml: |
|
||||
dependencies:
|
||||
- tfhe/Cargo.toml
|
||||
- concrete-csprng/**
|
||||
- tfhe-zk-pok/**
|
||||
csprng:
|
||||
- concrete-csprng/**
|
||||
zk_pok:
|
||||
- tfhe-zk-pok/**
|
||||
core_crypto:
|
||||
- tfhe/src/core_crypto/**
|
||||
boolean:
|
||||
- tfhe/src/core_crypto/**
|
||||
- tfhe/src/boolean/**
|
||||
shortint:
|
||||
- tfhe/src/core_crypto/**
|
||||
- tfhe/src/shortint/**
|
||||
integer:
|
||||
- tfhe/src/core_crypto/**
|
||||
- tfhe/src/shortint/**
|
||||
- tfhe/src/integer/**
|
||||
wasm:
|
||||
- tfhe/src/**
|
||||
- tfhe/js_on_wasm_tests/**
|
||||
- tfhe/web_wasm_parallel_tests/**
|
||||
- '!tfhe/src/c_api/**'
|
||||
- '!tfhe/src/boolean/**'
|
||||
high_level_api:
|
||||
- tfhe/src/**
|
||||
- '!tfhe/src/c_api/**'
|
||||
- '!tfhe/src/boolean/**'
|
||||
- '!tfhe/src/c_api/**'
|
||||
- '!tfhe/src/js_on_wasm_api/**'
|
||||
user_docs:
|
||||
- tfhe/src/**
|
||||
- '!tfhe/src/c_api/**'
|
||||
- 'tfhe/docs/**.md'
|
||||
- README.md
|
||||
|
||||
- name: Aggregate file changes
|
||||
id: aggregated-changes
|
||||
if: ( steps.changed-files.outputs.dependencies_any_changed == 'true' ||
|
||||
steps.changed-files.outputs.csprng_any_changed == 'true' ||
|
||||
steps.changed-files.outputs.zk_pok_any_changed == 'true' ||
|
||||
steps.changed-files.outputs.core_crypto_any_changed == 'true' ||
|
||||
steps.changed-files.outputs.boolean_any_changed == 'true' ||
|
||||
steps.changed-files.outputs.shortint_any_changed == 'true' ||
|
||||
steps.changed-files.outputs.integer_any_changed == 'true' ||
|
||||
steps.changed-files.outputs.wasm_any_changed == 'true' ||
|
||||
steps.changed-files.outputs.high_level_api_any_changed == 'true' ||
|
||||
steps.changed-files.outputs.user_docs_any_changed == 'true')
|
||||
run: |
|
||||
echo "any_changed=true" >> "$GITHUB_OUTPUT"
|
||||
|
||||
setup-instance:
|
||||
name: Setup instance (fast-tests)
|
||||
if: github.event_name != 'pull_request' ||
|
||||
needs.should-run.outputs.any_file_changed == 'true'
|
||||
needs: should-run
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
runner-name: ${{ steps.start-instance.outputs.label }}
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -37,7 +135,9 @@ jobs:
|
||||
|
||||
fast-tests:
|
||||
name: Fast CPU tests
|
||||
needs: setup-instance
|
||||
if: github.event_name != 'pull_request' ||
|
||||
(github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
|
||||
needs: [ should-run, setup-instance ]
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}_${{ github.ref }}
|
||||
cancel-in-progress: true
|
||||
@@ -53,59 +153,58 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
- name: Install git-lfs
|
||||
run: |
|
||||
sudo apt update && sudo apt -y install git-lfs
|
||||
|
||||
- name: Run concrete-csprng tests
|
||||
if: needs.should-run.outputs.csprng_test == 'true'
|
||||
run: |
|
||||
make test_concrete_csprng
|
||||
|
||||
- name: Run tfhe-zk-pok tests
|
||||
if: needs.should-run.outputs.zk_pok_test == 'true'
|
||||
run: |
|
||||
make test_zk_pok
|
||||
|
||||
- name: Run core tests
|
||||
if: needs.should-run.outputs.core_crypto_test == 'true'
|
||||
run: |
|
||||
AVX512_SUPPORT=ON make test_core_crypto
|
||||
|
||||
- name: Run boolean tests
|
||||
if: needs.should-run.outputs.boolean_test == 'true'
|
||||
run: |
|
||||
make test_boolean
|
||||
|
||||
- name: Run user docs tests
|
||||
if: needs.should-run.outputs.user_docs_test == 'true'
|
||||
run: |
|
||||
make test_user_doc
|
||||
|
||||
- name: Run js on wasm API tests
|
||||
if: needs.should-run.outputs.wasm_test == 'true'
|
||||
run: |
|
||||
make test_nodejs_wasm_api_in_docker
|
||||
|
||||
- name: Gen Keys if required
|
||||
if: needs.should-run.outputs.shortint_test == 'true' ||
|
||||
needs.should-run.outputs.integer_test == 'true'
|
||||
run: |
|
||||
make gen_key_cache
|
||||
|
||||
- name: Run shortint tests
|
||||
if: needs.should-run.outputs.shortint_test == 'true'
|
||||
run: |
|
||||
BIG_TESTS_INSTANCE=TRUE FAST_TESTS=TRUE make test_shortint_ci
|
||||
|
||||
- name: Run integer tests
|
||||
if: needs.should-run.outputs.integer_test == 'true'
|
||||
run: |
|
||||
BIG_TESTS_INSTANCE=TRUE FAST_TESTS=TRUE make test_integer_ci
|
||||
|
||||
- name: Run shortint multi-bit tests
|
||||
run: |
|
||||
BIG_TESTS_INSTANCE=TRUE FAST_TESTS=TRUE make test_shortint_multi_bit_ci
|
||||
|
||||
- name: Run integer multi-bit tests
|
||||
run: |
|
||||
BIG_TESTS_INSTANCE=TRUE FAST_TESTS=TRUE make test_integer_multi_bit_ci
|
||||
|
||||
- name: Run high-level API tests
|
||||
if: needs.should-run.outputs.high_level_api_test == 'true'
|
||||
run: |
|
||||
make test_high_level_api
|
||||
|
||||
@@ -113,17 +212,6 @@ jobs:
|
||||
run: |
|
||||
make test_safe_deserialization
|
||||
|
||||
- name: Clone test data
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
with:
|
||||
repository: zama-ai/tfhe-backward-compat-data
|
||||
path: tfhe/tfhe-backward-compat-data
|
||||
lfs: 'true'
|
||||
|
||||
- name: Run backward compatibility tests
|
||||
run: |
|
||||
make test_backward_compatibility_ci
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ always() }}
|
||||
continue-on-error: true
|
||||
@@ -140,7 +228,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
54
.github/workflows/aws_tfhe_integer_tests.yml
vendored
54
.github/workflows/aws_tfhe_integer_tests.yml
vendored
@@ -19,25 +19,55 @@ on:
|
||||
# Allows you to run this workflow manually from the Actions tab as an alternative.
|
||||
workflow_dispatch:
|
||||
pull_request:
|
||||
types: [ labeled ]
|
||||
types: [labeled]
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
schedule:
|
||||
# Nightly tests @ 3AM after each work day
|
||||
- cron: "0 3 * * MON-FRI"
|
||||
|
||||
jobs:
|
||||
should-run:
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
pull-requests: write
|
||||
outputs:
|
||||
integer_test: ${{ github.event_name == 'workflow_dispatch' ||
|
||||
steps.changed-files.outputs.integer_any_changed }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: "false"
|
||||
|
||||
- name: Check for file changes
|
||||
id: changed-files
|
||||
uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
|
||||
with:
|
||||
since_last_remote_commit: true
|
||||
files_yaml: |
|
||||
integer:
|
||||
- tfhe/Cargo.toml
|
||||
- concrete-csprng/**
|
||||
- tfhe-zk-pok/**
|
||||
- tfhe/src/core_crypto/**
|
||||
- tfhe/src/shortint/**
|
||||
- tfhe/src/integer/**
|
||||
|
||||
setup-instance:
|
||||
name: Setup instance (unsigned-integer-tests)
|
||||
if: ${{ github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'approved') }}
|
||||
needs: should-run
|
||||
if:
|
||||
(github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs' && needs.should-run.outputs.integer_test == 'true') ||
|
||||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
|
||||
(github.event_name == 'pull_request' && contains(github.event.label.name, 'approved')) ||
|
||||
github.event_name == 'workflow_dispatch'
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
runner-name: ${{ steps.start-instance.outputs.label }}
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -50,21 +80,21 @@ jobs:
|
||||
name: Unsigned integer tests
|
||||
needs: setup-instance
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}_${{ github.ref }}
|
||||
cancel-in-progress: true
|
||||
group: ${{ github.workflow }}_${{ github.ref }}${{ github.ref == 'refs/heads/main' && github.sha || '' }}
|
||||
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
|
||||
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
with:
|
||||
persist-credentials: 'false'
|
||||
persist-credentials: "false"
|
||||
|
||||
- name: Set up home
|
||||
run: |
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
@@ -100,12 +130,12 @@ jobs:
|
||||
teardown-instance:
|
||||
name: Teardown instance (unsigned-integer-tests)
|
||||
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
|
||||
needs: [ setup-instance, unsigned-integer-tests ]
|
||||
needs: [setup-instance, unsigned-integer-tests]
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
@@ -19,25 +19,55 @@ on:
|
||||
# Allows you to run this workflow manually from the Actions tab as an alternative.
|
||||
workflow_dispatch:
|
||||
pull_request:
|
||||
types: [ labeled ]
|
||||
types: [labeled]
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
schedule:
|
||||
# Nightly tests @ 3AM after each work day
|
||||
- cron: "0 3 * * MON-FRI"
|
||||
|
||||
jobs:
|
||||
should-run:
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
pull-requests: write
|
||||
outputs:
|
||||
integer_test: ${{ github.event_name == 'workflow_dispatch' ||
|
||||
steps.changed-files.outputs.integer_any_changed }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: "false"
|
||||
|
||||
- name: Check for file changes
|
||||
id: changed-files
|
||||
uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
|
||||
with:
|
||||
since_last_remote_commit: true
|
||||
files_yaml: |
|
||||
integer:
|
||||
- tfhe/Cargo.toml
|
||||
- concrete-csprng/**
|
||||
- tfhe-zk-pok/**
|
||||
- tfhe/src/core_crypto/**
|
||||
- tfhe/src/shortint/**
|
||||
- tfhe/src/integer/**
|
||||
|
||||
setup-instance:
|
||||
name: Setup instance (signed-integer-tests)
|
||||
if: ${{ github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'approved') }}
|
||||
name: Setup instance (unsigned-integer-tests)
|
||||
needs: should-run
|
||||
if:
|
||||
(github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs' && needs.should-run.outputs.integer_test == 'true') ||
|
||||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
|
||||
(github.event_name == 'pull_request' && contains(github.event.label.name, 'approved')) ||
|
||||
github.event_name == 'workflow_dispatch'
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
runner-name: ${{ steps.start-instance.outputs.label }}
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -50,21 +80,21 @@ jobs:
|
||||
name: Signed integer tests
|
||||
needs: setup-instance
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}_${{ github.ref }}
|
||||
cancel-in-progress: true
|
||||
group: ${{ github.workflow }}_${{ github.ref }}${{ github.ref == 'refs/heads/main' && github.sha || '' }}
|
||||
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
|
||||
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
with:
|
||||
persist-credentials: 'false'
|
||||
persist-credentials: "false"
|
||||
|
||||
- name: Set up home
|
||||
run: |
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
@@ -104,12 +134,12 @@ jobs:
|
||||
teardown-instance:
|
||||
name: Teardown instance (signed-integer-tests)
|
||||
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
|
||||
needs: [ setup-instance, signed-integer-tests ]
|
||||
needs: [setup-instance, signed-integer-tests]
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
12
.github/workflows/aws_tfhe_tests.yml
vendored
12
.github/workflows/aws_tfhe_tests.yml
vendored
@@ -63,7 +63,7 @@ jobs:
|
||||
|
||||
- name: Check for file changes
|
||||
id: changed-files
|
||||
uses: tj-actions/changed-files@eaf854ef0c266753e1abec356dcf17d92695b251
|
||||
uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
|
||||
with:
|
||||
since_last_remote_commit: true
|
||||
files_yaml: |
|
||||
@@ -86,6 +86,8 @@ jobs:
|
||||
high_level_api:
|
||||
- tfhe/src/**
|
||||
- '!tfhe/src/c_api/**'
|
||||
- '!tfhe/src/boolean/**'
|
||||
- '!tfhe/src/js_on_wasm_api/**'
|
||||
c_api:
|
||||
- tfhe/src/**
|
||||
examples:
|
||||
@@ -121,7 +123,7 @@ jobs:
|
||||
setup-instance:
|
||||
name: Setup instance (cpu-tests)
|
||||
if: github.event_name != 'pull_request' ||
|
||||
(github.event_name == 'pull_request' && needs.should-run.outputs.any_file_changed == 'true')
|
||||
(github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.any_file_changed == 'true')
|
||||
needs: should-run
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
@@ -129,7 +131,7 @@ jobs:
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -158,7 +160,7 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
@@ -235,7 +237,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
6
.github/workflows/aws_tfhe_wasm_tests.yml
vendored
6
.github/workflows/aws_tfhe_wasm_tests.yml
vendored
@@ -27,7 +27,7 @@ jobs:
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -54,7 +54,7 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
@@ -90,7 +90,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
119
.github/workflows/boolean_benchmark.yml
vendored
119
.github/workflows/boolean_benchmark.yml
vendored
@@ -3,30 +3,9 @@ name: Boolean benchmarks
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
instance_id:
|
||||
description: "Instance ID"
|
||||
type: string
|
||||
instance_image_id:
|
||||
description: "Instance AMI ID"
|
||||
type: string
|
||||
instance_type:
|
||||
description: "Instance product type"
|
||||
type: string
|
||||
runner_name:
|
||||
description: "Action runner name"
|
||||
type: string
|
||||
request_id:
|
||||
description: "Slab request ID"
|
||||
type: string
|
||||
# This input is not used in this workflow but still mandatory since a calling workflow could
|
||||
# use it. If a triggering command include a user_inputs field, then the triggered workflow
|
||||
# must include this very input, otherwise the workflow won't be called.
|
||||
# See start_full_benchmarks.yml as example.
|
||||
user_inputs:
|
||||
description: "Type of benchmarks to run"
|
||||
type: string
|
||||
default: "weekly_benchmarks"
|
||||
schedule:
|
||||
# Weekly benchmarks will be triggered each Saturday at 1a.m.
|
||||
- cron: '0 1 * * 6'
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
@@ -34,36 +13,60 @@ env:
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUST_BACKTRACE: "full"
|
||||
RUST_MIN_STACK: "8388608"
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
|
||||
jobs:
|
||||
run-boolean-benchmarks:
|
||||
name: Execute boolean benchmarks in EC2
|
||||
runs-on: ${{ github.event.inputs.runner_name }}
|
||||
if: ${{ !cancelled() }}
|
||||
setup-instance:
|
||||
name: Setup instance (boolean-benchmarks)
|
||||
runs-on: ubuntu-latest
|
||||
if: github.event_name != 'schedule' ||
|
||||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
|
||||
outputs:
|
||||
runner-name: ${{ steps.start-instance.outputs.label }}
|
||||
steps:
|
||||
- name: Instance configuration used
|
||||
run: |
|
||||
echo "IDs: ${{ inputs.instance_id }}"
|
||||
echo "AMI: ${{ inputs.instance_image_id }}"
|
||||
echo "Type: ${{ inputs.instance_type }}"
|
||||
echo "Request ID: ${{ inputs.request_id }}"
|
||||
|
||||
- name: Get benchmark date
|
||||
run: |
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
backend: aws
|
||||
profile: bench
|
||||
|
||||
boolean-benchmarks:
|
||||
name: Execute boolean benchmarks in EC2
|
||||
needs: setup-instance
|
||||
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}_${{ github.ref }}
|
||||
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
|
||||
continue-on-error: true
|
||||
steps:
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Get benchmark details
|
||||
run: |
|
||||
{
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)";
|
||||
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
|
||||
echo "COMMIT_HASH=$(git describe --tags --dirty)";
|
||||
} >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Set up home
|
||||
# The "Install rust" step requires the root user to have a HOME directory, which is not set.
|
||||
run: |
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
@@ -73,14 +76,12 @@ jobs:
|
||||
|
||||
- name: Parse results
|
||||
run: |
|
||||
COMMIT_DATE="$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})"
|
||||
COMMIT_HASH="$(git describe --tags --dirty)"
|
||||
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
|
||||
--database tfhe_rs \
|
||||
--hardware ${{ inputs.instance_type }} \
|
||||
--project-version "${COMMIT_HASH}" \
|
||||
--hardware "hpc7a.96xlarge" \
|
||||
--project-version "${{ env.COMMIT_HASH }}" \
|
||||
--branch ${{ github.ref_name }} \
|
||||
--commit-date "${COMMIT_DATE}" \
|
||||
--commit-date "${{ env.COMMIT_DATE }}" \
|
||||
--bench-date "${{ env.BENCH_DATE }}" \
|
||||
--walk-subdirs \
|
||||
--name-suffix avx512 \
|
||||
@@ -97,7 +98,7 @@ jobs:
|
||||
--append-results
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
|
||||
uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
|
||||
with:
|
||||
name: ${{ github.sha }}_boolean
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
@@ -129,8 +130,28 @@ jobs:
|
||||
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_MESSAGE: "Boolean benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
|
||||
teardown-instance:
|
||||
name: Teardown instance (boolean-benchmarks)
|
||||
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
|
||||
needs: [ setup-instance, boolean-benchmarks ]
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
label: ${{ needs.setup-instance.outputs.runner-name }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "Instance teardown (boolean-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
|
||||
19
.github/workflows/cargo_build.yml
vendored
19
.github/workflows/cargo_build.yml
vendored
@@ -19,14 +19,21 @@ jobs:
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
os: [ubuntu-latest, macos-latest-large, large_windows_16_latest]
|
||||
# GitHub macos-latest are now M1 macs, so use ours, we limit what runs so it will be fast
|
||||
# even with a few PRs
|
||||
os: [large_ubuntu_16, macos-latest, windows-latest]
|
||||
fail-fast: false
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
- name: Install and run newline linter checks
|
||||
if: matrix.os == 'ubuntu-latest'
|
||||
if: ${{ contains(matrix.os, 'ubuntu') }}
|
||||
run: |
|
||||
wget https://github.com/fernandrone/linelint/releases/download/0.0.6/linelint-linux-amd64
|
||||
echo "16b70fb7b471d6f95cbdc0b4e5dc2b0ac9e84ba9ecdc488f7bdf13df823aca4b linelint-linux-amd64" > checksum
|
||||
@@ -36,27 +43,33 @@ jobs:
|
||||
make check_newline
|
||||
|
||||
- name: Run pcc checks
|
||||
if: ${{ contains(matrix.os, 'ubuntu') }}
|
||||
run: |
|
||||
make pcc
|
||||
|
||||
- name: Build concrete-csprng
|
||||
if: ${{ contains(matrix.os, 'ubuntu') }}
|
||||
run: |
|
||||
make build_concrete_csprng
|
||||
|
||||
- name: Build Release core
|
||||
if: ${{ contains(matrix.os, 'ubuntu') }}
|
||||
run: |
|
||||
make build_core AVX512_SUPPORT=ON
|
||||
make build_core_experimental AVX512_SUPPORT=ON
|
||||
|
||||
- name: Build Release boolean
|
||||
if: ${{ contains(matrix.os, 'ubuntu') }}
|
||||
run: |
|
||||
make build_boolean
|
||||
|
||||
- name: Build Release shortint
|
||||
if: ${{ contains(matrix.os, 'ubuntu') }}
|
||||
run: |
|
||||
make build_shortint
|
||||
|
||||
- name: Build Release integer
|
||||
if: ${{ contains(matrix.os, 'ubuntu') }}
|
||||
run: |
|
||||
make build_integer
|
||||
|
||||
@@ -65,10 +78,12 @@ jobs:
|
||||
make build_tfhe_full
|
||||
|
||||
- name: Build Release c_api
|
||||
if: ${{ contains(matrix.os, 'ubuntu') }}
|
||||
run: |
|
||||
make build_c_api
|
||||
|
||||
- name: Build coverage tests
|
||||
if: ${{ contains(matrix.os, 'ubuntu') }}
|
||||
run: |
|
||||
make build_tfhe_coverage
|
||||
|
||||
|
||||
2
.github/workflows/check_commit.yml
vendored
2
.github/workflows/check_commit.yml
vendored
@@ -10,7 +10,7 @@ jobs:
|
||||
- name: Check first line
|
||||
uses: gsactions/commit-message-checker@16fa2d5de096ae0d35626443bcd24f1e756cafee
|
||||
with:
|
||||
pattern: '^((feat|fix|chore|refactor|style|test|docs|doc)(\([\w\-_]+\))?\:) .+$'
|
||||
pattern: '^((feat|fix|chore|refactor|style|test|docs|doc)(\([\w\-_]+\))?\!?\:) .+$'
|
||||
flags: "gs"
|
||||
error: 'Your first line has to contain a commit type and scope like "feat(my_feature): msg".'
|
||||
excludeDescription: "true" # optional: this excludes the description body of a pull request
|
||||
|
||||
8
.github/workflows/code_coverage.yml
vendored
8
.github/workflows/code_coverage.yml
vendored
@@ -25,7 +25,7 @@ jobs:
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -51,13 +51,13 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
- name: Check for file changes
|
||||
id: changed-files
|
||||
uses: tj-actions/changed-files@eaf854ef0c266753e1abec356dcf17d92695b251
|
||||
uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
|
||||
with:
|
||||
files_yaml: |
|
||||
tfhe:
|
||||
@@ -125,7 +125,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
115
.github/workflows/core_crypto_benchmark.yml
vendored
115
.github/workflows/core_crypto_benchmark.yml
vendored
@@ -3,30 +3,6 @@ name: Core crypto benchmarks
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
instance_id:
|
||||
description: "Instance ID"
|
||||
type: string
|
||||
instance_image_id:
|
||||
description: "Instance AMI ID"
|
||||
type: string
|
||||
instance_type:
|
||||
description: "Instance product type"
|
||||
type: string
|
||||
runner_name:
|
||||
description: "Action runner name"
|
||||
type: string
|
||||
request_id:
|
||||
description: "Slab request ID"
|
||||
type: string
|
||||
# This input is not used in this workflow but still mandatory since a calling workflow could
|
||||
# use it. If a triggering command includes a user_inputs field, then the triggered workflow
|
||||
# must include this very input, otherwise the workflow won't be called.
|
||||
# See start_full_benchmarks.yml as example.
|
||||
user_inputs:
|
||||
description: "Type of benchmarks to run"
|
||||
type: string
|
||||
default: "weekly_benchmarks"
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
@@ -34,36 +10,59 @@ env:
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUST_BACKTRACE: "full"
|
||||
RUST_MIN_STACK: "8388608"
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
|
||||
jobs:
|
||||
run-core-crypto-benchmarks:
|
||||
name: Execute core crypto benchmarks in EC2
|
||||
runs-on: ${{ github.event.inputs.runner_name }}
|
||||
if: ${{ !cancelled() }}
|
||||
setup-instance:
|
||||
name: Setup instance (core-crypto-benchmarks)
|
||||
runs-on: ubuntu-latest
|
||||
if: github.event_name != 'schedule' ||
|
||||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
|
||||
outputs:
|
||||
runner-name: ${{ steps.start-instance.outputs.label }}
|
||||
steps:
|
||||
- name: Instance configuration used
|
||||
run: |
|
||||
echo "IDs: ${{ inputs.instance_id }}"
|
||||
echo "AMI: ${{ inputs.instance_image_id }}"
|
||||
echo "Type: ${{ inputs.instance_type }}"
|
||||
echo "Request ID: ${{ inputs.request_id }}"
|
||||
|
||||
- name: Get benchmark date
|
||||
run: |
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
backend: aws
|
||||
profile: bench
|
||||
|
||||
core-crypto-benchmarks:
|
||||
name: Execute core crypto benchmarks in EC2
|
||||
needs: setup-instance
|
||||
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}_${{ github.ref }}
|
||||
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Get benchmark details
|
||||
run: |
|
||||
{
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)";
|
||||
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
|
||||
echo "COMMIT_HASH=$(git describe --tags --dirty)";
|
||||
} >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Set up home
|
||||
# The "Install rust" step requires the root user to have a HOME directory, which is not set.
|
||||
run: |
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
@@ -75,21 +74,19 @@ jobs:
|
||||
|
||||
- name: Parse results
|
||||
run: |
|
||||
COMMIT_DATE="$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})"
|
||||
COMMIT_HASH="$(git describe --tags --dirty)"
|
||||
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
|
||||
--database tfhe_rs \
|
||||
--hardware ${{ inputs.instance_type }} \
|
||||
--project-version "${COMMIT_HASH}" \
|
||||
--hardware "hpc7a.96xlarge" \
|
||||
--project-version "${{ env.COMMIT_HASH }}" \
|
||||
--branch ${{ github.ref_name }} \
|
||||
--commit-date "${COMMIT_DATE}" \
|
||||
--commit-date "${{ env.COMMIT_DATE }}" \
|
||||
--bench-date "${{ env.BENCH_DATE }}" \
|
||||
--name-suffix avx512 \
|
||||
--walk-subdirs \
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
|
||||
uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
|
||||
with:
|
||||
name: ${{ github.sha }}_core_crypto
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
@@ -121,8 +118,28 @@ jobs:
|
||||
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_MESSAGE: "PBS benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
|
||||
teardown-instance:
|
||||
name: Teardown instance (core-crypto-benchmarks)
|
||||
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
|
||||
needs: [ setup-instance, core-crypto-benchmarks ]
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
label: ${{ needs.setup-instance.outputs.runner-name }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "Instance teardown (core-crypto-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
|
||||
10
.github/workflows/core_crypto_gpu_benchmark.yml
vendored
10
.github/workflows/core_crypto_gpu_benchmark.yml
vendored
@@ -27,7 +27,7 @@ jobs:
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -50,7 +50,7 @@ jobs:
|
||||
gcc: 11
|
||||
env:
|
||||
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
|
||||
CMAKE_VERSION: 3.29.1
|
||||
CMAKE_VERSION: 3.29.6
|
||||
steps:
|
||||
# Mandatory on hyperstack since a bootable volume is not re-usable yet.
|
||||
- name: Install dependencies
|
||||
@@ -83,7 +83,7 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
@@ -128,7 +128,7 @@ jobs:
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
|
||||
uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
|
||||
with:
|
||||
name: ${{ github.sha }}_core_crypto
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
@@ -175,7 +175,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
@@ -27,7 +27,7 @@ jobs:
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -54,7 +54,7 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
@@ -78,7 +78,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
123
.github/workflows/data_pr_close.yml
vendored
Normal file
123
.github/workflows/data_pr_close.yml
vendored
Normal file
@@ -0,0 +1,123 @@
|
||||
name: Close or Merge corresponding PR on the data repo
|
||||
|
||||
# When a PR with the data_PR label is closed or merged, this will close or merge the corresponding PR in the data repo.
|
||||
|
||||
env:
|
||||
TARGET_REPO_API_URL: ${{ github.api_url }}/repos/zama-ai/tfhe-backward-compat-data
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
PR_BRANCH: ${{ github.head_ref || github.ref_name }}
|
||||
CLOSE_TYPE: ${{ github.event.pull_request.merged && 'merge' || 'close' }}
|
||||
|
||||
# only trigger on pull request closed events
|
||||
on:
|
||||
pull_request:
|
||||
types: [ closed ]
|
||||
|
||||
# The same pattern is used for jobs that use the github api:
|
||||
# - save the result of the API call in the env var "GH_API_RES". Since the var is multiline
|
||||
# we use this trick: https://docs.github.com/en/actions/using-workflows/workflow-commands-for-github-actions#example-of-a-multiline-string
|
||||
# - "set +e" will make sure we reach the last "echo EOF" even in case of error
|
||||
# - "set -o pipefail" makes a one-line piped command return a failure status if any command in the pipe fails
|
||||
# - 'RES="$?"' and 'exit $RES' are used to return the error code if a command failed. Without it, with "set +e"
|
||||
# the script will always return 0 because of the "echo EOF".
|
||||
|
||||
|
||||
jobs:
|
||||
auto_close_job:
|
||||
if: ${{ contains(github.event.pull_request.labels.*.name, 'data_PR') }}
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Find corresponding Pull Request in the data repo
|
||||
run: |
|
||||
{
|
||||
set +e
|
||||
set -o pipefail
|
||||
echo 'TARGET_REPO_PR<<EOF'
|
||||
curl --fail-with-body --no-progress-meter -L -X GET \
|
||||
-H "Accept: application/vnd.github+json" \
|
||||
-H "X-GitHub-Api-Version: 2022-11-28" \
|
||||
${{ env.TARGET_REPO_API_URL }}/pulls\?head=${{ github.repository_owner }}:${{ env.PR_BRANCH }} | jq -e '.[0]' | sed 's/null/{ "message": "corresponding PR not found" }/'
|
||||
RES="$?"
|
||||
echo EOF
|
||||
} >> "${GITHUB_ENV}"
|
||||
exit $RES
|
||||
|
||||
- name: Comment on the PR to indicate the reason of the close
|
||||
run: |
|
||||
{
|
||||
set +e
|
||||
set -o pipefail
|
||||
echo 'GH_API_RES<<EOF'
|
||||
curl --fail-with-body --no-progress-meter -L -X POST \
|
||||
-H "Accept: application/vnd.github+json" \
|
||||
-H "Authorization: Bearer ${{ secrets.FHE_ACTIONS_TOKEN }}" \
|
||||
-H "X-GitHub-Api-Version: 2022-11-28" \
|
||||
${{ fromJson(env.TARGET_REPO_PR).comments_url }} \
|
||||
-d '{ "body": "PR ${{ env.CLOSE_TYPE }}d because the corresponding PR in main repo was ${{ env.CLOSE_TYPE }}d: ${{ github.repository }}#${{ github.event.number }}" }'
|
||||
RES="$?"
|
||||
echo EOF
|
||||
} >> "${GITHUB_ENV}"
|
||||
exit $RES
|
||||
|
||||
- name: Merge the Pull Request in the data repo
|
||||
if: ${{ github.event.pull_request.merged }}
|
||||
run: |
|
||||
{
|
||||
set +e
|
||||
set -o pipefail
|
||||
echo 'GH_API_RES<<EOF'
|
||||
curl --fail-with-body --no-progress-meter -L -X PUT \
|
||||
-H "Accept: application/vnd.github+json" \
|
||||
-H "Authorization: Bearer ${{ secrets.FHE_ACTIONS_TOKEN }}" \
|
||||
-H "X-GitHub-Api-Version: 2022-11-28" \
|
||||
${{ fromJson(env.TARGET_REPO_PR).url }}/merge \
|
||||
-d '{ "merge_method": "rebase" }'
|
||||
RES="$?"
|
||||
echo EOF
|
||||
} >> "${GITHUB_ENV}"
|
||||
exit $RES
|
||||
|
||||
- name: Close the Pull Request in the data repo
|
||||
if: ${{ !github.event.pull_request.merged }}
|
||||
run: |
|
||||
{
|
||||
set +e
|
||||
set -o pipefail
|
||||
echo 'GH_API_RES<<EOF'
|
||||
curl --fail-with-body --no-progress-meter -L -X PATCH \
|
||||
-H "Accept: application/vnd.github+json" \
|
||||
-H "Authorization: Bearer ${{ secrets.FHE_ACTIONS_TOKEN }}" \
|
||||
-H "X-GitHub-Api-Version: 2022-11-28" \
|
||||
${{ fromJson(env.TARGET_REPO_PR).url }} \
|
||||
-d '{ "state": "closed" }'
|
||||
RES="$?"
|
||||
echo EOF
|
||||
} >> "${GITHUB_ENV}"
|
||||
exit $RES
|
||||
|
||||
- name: Delete the associated branch in the data repo
|
||||
run: |
|
||||
{
|
||||
set +e
|
||||
set -o pipefail
|
||||
echo 'GH_API_RES<<EOF'
|
||||
curl --fail-with-body --no-progress-meter -L -X DELETE \
|
||||
-H "Accept: application/vnd.github+json" \
|
||||
-H "Authorization: Bearer ${{ secrets.FHE_ACTIONS_TOKEN }}" \
|
||||
-H "X-GitHub-Api-Version: 2022-11-28" \
|
||||
${{ env.TARGET_REPO_API_URL }}/git/refs/heads/${{ env.PR_BRANCH }}
|
||||
RES="$?"
|
||||
echo EOF
|
||||
} >> "${GITHUB_ENV}"
|
||||
exit $RES
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ always() && job.status == 'failure' }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "Failed to auto-${{ env.CLOSE_TYPE }} PR on data repo: ${{ fromJson(env.GH_API_RES || env.TARGET_REPO_PR).message }}"
|
||||
@@ -1,5 +1,5 @@
|
||||
# Run all benchmarks on an RTX 4090 machine and return parsed results to Slab CI bot.
|
||||
name: TFHE Cuda Backend - 4090 full benchmarks
|
||||
# Run benchmarks on an RTX 4090 machine and return parsed results to Slab CI bot.
|
||||
name: TFHE Cuda Backend - 4090 benchmarks
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
@@ -11,6 +11,7 @@ env:
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
FAST_BENCH: TRUE
|
||||
|
||||
on:
|
||||
# Allows you to run this workflow manually from the Actions tab as an alternative.
|
||||
@@ -23,8 +24,10 @@ on:
|
||||
|
||||
jobs:
|
||||
cuda-integer-benchmarks:
|
||||
name: Cuda integer benchmarks for all operations flavor (RTX 4090)
|
||||
if: ${{ github.event_name == 'workflow_dispatch' || github.event_name == 'schedule' || contains(github.event.label.name, '4090_bench') }}
|
||||
name: Cuda integer benchmarks (RTX 4090)
|
||||
if: ${{ github.event_name == 'workflow_dispatch' ||
|
||||
github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs' ||
|
||||
contains(github.event.label.name, '4090_bench') }}
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}_${{ github.ref }}_cuda_integer_bench
|
||||
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
|
||||
@@ -33,9 +36,6 @@ jobs:
|
||||
strategy:
|
||||
fail-fast: false
|
||||
max-parallel: 1
|
||||
matrix:
|
||||
command: [integer, integer_multi_bit]
|
||||
op_flavor: [default, unchecked]
|
||||
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
@@ -50,9 +50,10 @@ jobs:
|
||||
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
|
||||
echo "COMMIT_HASH=$(git describe --tags --dirty)";
|
||||
} >> "${GITHUB_ENV}"
|
||||
echo "FAST_BENCH=TRUE" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
@@ -65,7 +66,7 @@ jobs:
|
||||
|
||||
- name: Run integer benchmarks
|
||||
run: |
|
||||
make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_${{ matrix.command }}_gpu
|
||||
make BENCH_OP_FLAVOR=default bench_integer_multi_bit_gpu
|
||||
|
||||
- name: Parse results
|
||||
run: |
|
||||
@@ -81,9 +82,9 @@ jobs:
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
|
||||
uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
|
||||
with:
|
||||
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
|
||||
name: ${{ github.sha }}_integer_multi_bit_gpu_default
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
- name: Send data to Slab
|
||||
@@ -133,7 +134,7 @@ jobs:
|
||||
} >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
@@ -144,7 +145,7 @@ jobs:
|
||||
path: slab
|
||||
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Run integer benchmarks
|
||||
- name: Run core crypto benchmarks
|
||||
run: |
|
||||
make bench_pbs_gpu
|
||||
make bench_ks_gpu
|
||||
@@ -163,7 +164,7 @@ jobs:
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
|
||||
uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
|
||||
with:
|
||||
name: ${{ github.sha }}_core_crypto
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
@@ -39,7 +39,7 @@ jobs:
|
||||
persist-credentials: 'false'
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
199
.github/workflows/gpu_fast_h100_tests.yml
vendored
Normal file
199
.github/workflows/gpu_fast_h100_tests.yml
vendored
Normal file
@@ -0,0 +1,199 @@
|
||||
# Compile and test tfhe-cuda-backend on an H100 VM on hyperstack
|
||||
name: TFHE Cuda Backend - Fast tests on H100
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUSTFLAGS: "-C target-cpu=native"
|
||||
RUST_BACKTRACE: "full"
|
||||
RUST_MIN_STACK: "8388608"
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}
|
||||
|
||||
on:
|
||||
# Allows you to run this workflow manually from the Actions tab as an alternative.
|
||||
workflow_dispatch:
|
||||
pull_request:
|
||||
types: [ labeled ]
|
||||
|
||||
jobs:
|
||||
should-run:
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
pull-requests: write
|
||||
outputs:
|
||||
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Check for file changes
|
||||
id: changed-files
|
||||
uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
|
||||
with:
|
||||
since_last_remote_commit: true
|
||||
files_yaml: |
|
||||
gpu:
|
||||
- tfhe/Cargo.toml
|
||||
- tfhe/build.rs
|
||||
- backends/tfhe-cuda-backend/**
|
||||
- tfhe/src/core_crypto/gpu/**
|
||||
- tfhe/src/integer/gpu/**
|
||||
- tfhe/shortint/parameters/**
|
||||
- tfhe/src/high_level_api/**
|
||||
- tfhe/src/c_api/**
|
||||
- 'tfhe/docs/**.md'
|
||||
- Makefile
|
||||
- '.github/workflows/gpu_fast_h100_tests.yml'
|
||||
- scripts/**
|
||||
- ci/**
|
||||
|
||||
setup-instance:
|
||||
name: Setup instance (cuda-h100-tests)
|
||||
needs: should-run
|
||||
if: github.event_name != 'pull_request' ||
|
||||
(github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true') ||
|
||||
(github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
runner-name: ${{ steps.start-instance.outputs.label }}
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
backend: hyperstack
|
||||
profile: single-h100
|
||||
|
||||
cuda-tests-linux:
|
||||
name: CUDA H100 tests
|
||||
needs: [ should-run, setup-instance ]
|
||||
if: github.event_name != 'pull_request' ||
|
||||
(github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}_${{ github.ref }}
|
||||
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
|
||||
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
|
||||
strategy:
|
||||
fail-fast: false
|
||||
# explicit include-based build matrix, of known valid options
|
||||
matrix:
|
||||
include:
|
||||
- os: ubuntu-22.04
|
||||
cuda: "12.2"
|
||||
gcc: 11
|
||||
env:
|
||||
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
|
||||
CMAKE_VERSION: 3.29.6
|
||||
steps:
|
||||
# Mandatory on hyperstack since a bootable volume is not re-usable yet.
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
sudo apt update
|
||||
sudo apt install -y checkinstall zlib1g-dev libssl-dev
|
||||
wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
|
||||
tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
|
||||
cd cmake-${{ env.CMAKE_VERSION }}
|
||||
./bootstrap
|
||||
make -j"$(nproc)"
|
||||
sudo make install
|
||||
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
with:
|
||||
persist-credentials: 'false'
|
||||
|
||||
- name: Set up home
|
||||
run: |
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
- name: Export CUDA variables
|
||||
if: ${{ !cancelled() }}
|
||||
run: |
|
||||
echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
|
||||
echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
|
||||
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
|
||||
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
|
||||
|
||||
# Specify the correct host compilers
|
||||
- name: Export gcc and g++ variables
|
||||
if: ${{ !cancelled() }}
|
||||
run: |
|
||||
{
|
||||
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
|
||||
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
|
||||
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
|
||||
echo "HOME=/home/ubuntu";
|
||||
} >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Check device is detected
|
||||
if: ${{ !cancelled() }}
|
||||
run: nvidia-smi
|
||||
|
||||
- name: Run core crypto and internal CUDA backend tests
|
||||
run: |
|
||||
BIG_TESTS_INSTANCE=TRUE make test_core_crypto_gpu
|
||||
BIG_TESTS_INSTANCE=TRUE make test_cuda_backend
|
||||
|
||||
- name: Run user docs tests
|
||||
run: |
|
||||
BIG_TESTS_INSTANCE=TRUE make test_user_doc_gpu
|
||||
|
||||
- name: Test C API
|
||||
run: |
|
||||
BIG_TESTS_INSTANCE=TRUE make test_c_api_gpu
|
||||
|
||||
- name: Run High Level API Tests
|
||||
run: |
|
||||
BIG_TESTS_INSTANCE=TRUE make test_high_level_api_gpu
|
||||
|
||||
slack-notify:
|
||||
name: Slack Notification
|
||||
needs: [ setup-instance, cuda-tests-linux ]
|
||||
runs-on: ubuntu-latest
|
||||
if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' }}
|
||||
continue-on-error: true
|
||||
steps:
|
||||
- name: Send message
|
||||
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
|
||||
env:
|
||||
SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}
|
||||
SLACK_MESSAGE: "Fast H100 tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.ACTION_RUN_URL }})"
|
||||
|
||||
teardown-instance:
|
||||
name: Teardown instance (cuda-h100-tests)
|
||||
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
|
||||
needs: [ setup-instance, cuda-tests-linux ]
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
label: ${{ needs.setup-instance.outputs.runner-name }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "Instance teardown (cuda-h100-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
@@ -11,6 +11,7 @@ env:
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}
|
||||
|
||||
on:
|
||||
# Allows you to run this workflow manually from the Actions tab as an alternative.
|
||||
@@ -18,26 +19,64 @@ on:
|
||||
pull_request:
|
||||
|
||||
jobs:
|
||||
should-run:
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
pull-requests: write
|
||||
outputs:
|
||||
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Check for file changes
|
||||
id: changed-files
|
||||
uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
|
||||
with:
|
||||
since_last_remote_commit: true
|
||||
files_yaml: |
|
||||
gpu:
|
||||
- tfhe/Cargo.toml
|
||||
- tfhe/build.rs
|
||||
- backends/tfhe-cuda-backend/**
|
||||
- tfhe/src/core_crypto/gpu/**
|
||||
- tfhe/src/integer/gpu/**
|
||||
- tfhe/shortint/parameters/**
|
||||
- tfhe/src/high_level_api/**
|
||||
- tfhe/src/c_api/**
|
||||
- 'tfhe/docs/**.md'
|
||||
- '.github/workflows/gpu_fast_tests.yml'
|
||||
- Makefile
|
||||
- scripts/**
|
||||
- ci/**
|
||||
|
||||
setup-instance:
|
||||
name: Setup instance (cuda-tests)
|
||||
needs: should-run
|
||||
if: github.event_name != 'pull_request' ||
|
||||
needs.should-run.outputs.gpu_test == 'true'
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
runner-name: ${{ steps.start-instance.outputs.label }}
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
backend: aws
|
||||
backend: hyperstack
|
||||
profile: gpu-test
|
||||
|
||||
cuda-tests-linux:
|
||||
name: CUDA tests
|
||||
needs: setup-instance
|
||||
needs: [ should-run, setup-instance ]
|
||||
if: github.event_name != 'pull_request' ||
|
||||
(github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}_${{ github.ref }}
|
||||
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
|
||||
@@ -49,11 +88,23 @@ jobs:
|
||||
include:
|
||||
- os: ubuntu-22.04
|
||||
cuda: "12.2"
|
||||
gcc: 9
|
||||
gcc: 11
|
||||
env:
|
||||
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
|
||||
|
||||
CMAKE_VERSION: 3.29.6
|
||||
steps:
|
||||
# Mandatory on hyperstack since a bootable volume is not re-usable yet.
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
sudo apt update
|
||||
sudo apt install -y checkinstall zlib1g-dev libssl-dev
|
||||
wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
|
||||
tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
|
||||
cd cmake-${{ env.CMAKE_VERSION }}
|
||||
./bootstrap
|
||||
make -j"$(nproc)"
|
||||
sudo make install
|
||||
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
with:
|
||||
@@ -64,7 +115,7 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
@@ -87,6 +138,10 @@ jobs:
|
||||
echo "HOME=/home/ubuntu";
|
||||
} >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Check device is detected
|
||||
if: ${{ !cancelled() }}
|
||||
run: nvidia-smi
|
||||
|
||||
- name: Run core crypto and internal CUDA backend tests
|
||||
run: |
|
||||
make test_core_crypto_gpu
|
||||
@@ -104,13 +159,18 @@ jobs:
|
||||
run: |
|
||||
make test_high_level_api_gpu
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ always() }}
|
||||
continue-on-error: true
|
||||
slack-notify:
|
||||
name: Slack Notification
|
||||
needs: [ setup-instance, cuda-tests-linux ]
|
||||
runs-on: ubuntu-latest
|
||||
if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' }}
|
||||
continue-on-error: true
|
||||
steps:
|
||||
- name: Send message
|
||||
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "CUDA AWS tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}
|
||||
SLACK_MESSAGE: "Base GPU tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.ACTION_RUN_URL }})"
|
||||
|
||||
teardown-instance:
|
||||
name: Teardown instance (cuda-tests)
|
||||
@@ -120,7 +180,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -11,33 +11,74 @@ env:
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}
|
||||
|
||||
on:
|
||||
# Allows you to run this workflow manually from the Actions tab as an alternative.
|
||||
workflow_dispatch:
|
||||
pull_request:
|
||||
types: [ labeled ]
|
||||
|
||||
jobs:
|
||||
should-run:
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
pull-requests: write
|
||||
outputs:
|
||||
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Check for file changes
|
||||
id: changed-files
|
||||
uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
|
||||
with:
|
||||
since_last_remote_commit: true
|
||||
files_yaml: |
|
||||
gpu:
|
||||
- tfhe/Cargo.toml
|
||||
- tfhe/build.rs
|
||||
- backends/tfhe-cuda-backend/**
|
||||
- tfhe/src/core_crypto/gpu/**
|
||||
- tfhe/src/integer/gpu/**
|
||||
- tfhe/shortint/parameters/**
|
||||
- tfhe/src/high_level_api/**
|
||||
- tfhe/src/c_api/**
|
||||
- 'tfhe/docs/**.md'
|
||||
- Makefile
|
||||
- '.github/workflows/**_multi_gpu_tests.yml'
|
||||
- scripts/**
|
||||
- ci/**
|
||||
|
||||
setup-instance:
|
||||
name: Setup instance (cuda-tests-multi-gpu)
|
||||
needs: should-run
|
||||
if: github.event_name != 'pull_request' ||
|
||||
(github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true') ||
|
||||
(github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
runner-name: ${{ steps.start-instance.outputs.label }}
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
backend: aws
|
||||
backend: hyperstack
|
||||
profile: multi-gpu-test
|
||||
|
||||
cuda-tests-linux:
|
||||
name: CUDA multi-GPU tests
|
||||
needs: [ setup-instance ]
|
||||
needs: [ should-run, setup-instance ]
|
||||
if: github.event_name != 'pull_request' ||
|
||||
(github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}_${{ github.ref }}
|
||||
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
|
||||
@@ -49,20 +90,34 @@ jobs:
|
||||
include:
|
||||
- os: ubuntu-22.04
|
||||
cuda: "12.2"
|
||||
gcc: 9
|
||||
gcc: 11
|
||||
env:
|
||||
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
|
||||
|
||||
CMAKE_VERSION: 3.29.6
|
||||
steps:
|
||||
# Mandatory on hyperstack since a bootable volume is not re-usable yet.
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
sudo apt update
|
||||
sudo apt install -y checkinstall zlib1g-dev libssl-dev
|
||||
wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
|
||||
tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
|
||||
cd cmake-${{ env.CMAKE_VERSION }}
|
||||
./bootstrap
|
||||
make -j"$(nproc)"
|
||||
sudo make install
|
||||
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
with:
|
||||
persist-credentials: 'false'
|
||||
|
||||
- name: Set up home
|
||||
run: |
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
@@ -85,29 +140,39 @@ jobs:
|
||||
echo "HOME=/home/ubuntu";
|
||||
} >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Run core crypto, integer and internal CUDA backend tests
|
||||
- name: Check device is detected
|
||||
if: ${{ !cancelled() }}
|
||||
run: nvidia-smi
|
||||
|
||||
# No need to test core_crypto and classic PBS in integer since it's already tested on single GPU.
|
||||
- name: Run multi-bit CUDA integer tests
|
||||
run: |
|
||||
make test_gpu
|
||||
BIG_TESTS_INSTANCE=TRUE make test_integer_multi_bit_gpu_ci
|
||||
|
||||
- name: Run user docs tests
|
||||
run: |
|
||||
make test_user_doc_gpu
|
||||
BIG_TESTS_INSTANCE=TRUE make test_user_doc_gpu
|
||||
|
||||
- name: Test C API
|
||||
run: |
|
||||
make test_c_api_gpu
|
||||
BIG_TESTS_INSTANCE=TRUE make test_c_api_gpu
|
||||
|
||||
- name: Run High Level API Tests
|
||||
run: |
|
||||
make test_high_level_api_gpu
|
||||
BIG_TESTS_INSTANCE=TRUE make test_high_level_api_gpu
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ always() }}
|
||||
continue-on-error: true
|
||||
slack-notify:
|
||||
name: Slack Notification
|
||||
needs: [ setup-instance, cuda-tests-linux ]
|
||||
runs-on: ubuntu-latest
|
||||
if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' }}
|
||||
continue-on-error: true
|
||||
steps:
|
||||
- name: Send message
|
||||
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "CUDA AWS multi-GPU tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}
|
||||
SLACK_MESSAGE: "Multi-GPU tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.ACTION_RUN_URL }})"
|
||||
|
||||
teardown-instance:
|
||||
name: Teardown instance (cuda-tests-multi-gpu)
|
||||
@@ -117,7 +182,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -24,7 +24,7 @@ jobs:
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -62,7 +62,7 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
@@ -109,7 +109,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -1,5 +1,5 @@
|
||||
# Compile and test tfhe-cuda-backend on an H100 VM on hyperstack
|
||||
name: TFHE Cuda Backend - Full tests on H100
|
||||
# Signed integer GPU tests on an H100 VM on hyperstack
|
||||
name: TFHE Cuda Backend - Signed integer tests on H100
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
@@ -11,22 +11,61 @@ env:
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}
|
||||
|
||||
on:
|
||||
# Allows you to run this workflow manually from the Actions tab as an alternative.
|
||||
workflow_dispatch:
|
||||
pull_request:
|
||||
types: [ labeled ]
|
||||
|
||||
jobs:
|
||||
should-run:
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
pull-requests: write
|
||||
outputs:
|
||||
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Check for file changes
|
||||
id: changed-files
|
||||
uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
|
||||
with:
|
||||
since_last_remote_commit: true
|
||||
files_yaml: |
|
||||
gpu:
|
||||
- tfhe/Cargo.toml
|
||||
- tfhe/build.rs
|
||||
- backends/tfhe-cuda-backend/**
|
||||
- tfhe/src/core_crypto/gpu/**
|
||||
- tfhe/src/integer/gpu/**
|
||||
- tfhe/shortint/parameters/**
|
||||
- tfhe/src/high_level_api/**
|
||||
- tfhe/src/c_api/**
|
||||
- 'tfhe/docs/**.md'
|
||||
- Makefile
|
||||
- '.github/workflows/gpu_signed_integer_h100_tests.yml'
|
||||
- scripts/**
|
||||
- ci/**
|
||||
|
||||
setup-instance:
|
||||
name: Setup instance (cuda-h100-tests)
|
||||
needs: should-run
|
||||
if: github.event_name != 'pull_request' ||
|
||||
(github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true') ||
|
||||
(github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
runner-name: ${{ steps.start-instance.outputs.label }}
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -36,8 +75,10 @@ jobs:
|
||||
profile: single-h100
|
||||
|
||||
cuda-tests-linux:
|
||||
name: CUDA H100 tests
|
||||
needs: [ setup-instance ]
|
||||
name: CUDA H100 signed integer tests
|
||||
needs: [ should-run, setup-instance ]
|
||||
if: github.event_name != 'pull_request' ||
|
||||
(github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}_${{ github.ref }}
|
||||
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
|
||||
@@ -52,22 +93,13 @@ jobs:
|
||||
gcc: 11
|
||||
env:
|
||||
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
|
||||
CMAKE_VERSION: 3.29.1
|
||||
CMAKE_VERSION: 3.29.6
|
||||
steps:
|
||||
# Mandatory on hyperstack since a bootable volume is not re-usable yet.
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
sudo apt update
|
||||
sudo apt install ca-certificates curl
|
||||
sudo install -m 0755 -d /etc/apt/keyrings
|
||||
sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
|
||||
sudo chmod a+r /etc/apt/keyrings/docker.asc
|
||||
echo \
|
||||
"deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \
|
||||
$(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \
|
||||
sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
|
||||
sudo apt update
|
||||
sudo apt install -y checkinstall zlib1g-dev libssl-dev docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin
|
||||
sudo apt install -y checkinstall zlib1g-dev libssl-dev
|
||||
wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
|
||||
tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
|
||||
cd cmake-${{ env.CMAKE_VERSION }}
|
||||
@@ -83,7 +115,7 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
@@ -106,31 +138,23 @@ jobs:
|
||||
echo "HOME=/home/ubuntu";
|
||||
} >> "${GITHUB_ENV}"
|
||||
|
||||
- name:
|
||||
- name: Check device is detected
|
||||
if: ${{ !cancelled() }}
|
||||
run: nvidia-smi
|
||||
|
||||
- name: Run core crypto, integer and internal CUDA backend tests
|
||||
- name: Run signed integer tests
|
||||
run: |
|
||||
make test_gpu
|
||||
BIG_TESTS_INSTANCE=TRUE make test_signed_integer_gpu_ci
|
||||
|
||||
- name: Run user docs tests
|
||||
- name: Run signed integer multi-bit tests
|
||||
run: |
|
||||
make test_user_doc_gpu
|
||||
|
||||
- name: Test C API
|
||||
run: |
|
||||
make test_c_api_gpu
|
||||
|
||||
- name: Run High Level API Tests
|
||||
run: |
|
||||
make test_high_level_api_gpu
|
||||
BIG_TESTS_INSTANCE=TRUE make test_signed_integer_multi_bit_gpu_ci
|
||||
|
||||
slack-notify:
|
||||
name: Slack Notification
|
||||
needs: [ setup-instance, cuda-tests-linux ]
|
||||
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
|
||||
if: ${{ !success() && !cancelled() }}
|
||||
runs-on: ubuntu-latest
|
||||
if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' }}
|
||||
continue-on-error: true
|
||||
steps:
|
||||
- name: Send message
|
||||
@@ -147,7 +171,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -11,33 +11,82 @@ env:
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
FAST_TESTS: TRUE
|
||||
NIGHTLY_TESTS: FALSE
|
||||
IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}
|
||||
|
||||
on:
|
||||
# Allows you to run this workflow manually from the Actions tab as an alternative.
|
||||
workflow_dispatch:
|
||||
pull_request:
|
||||
types:
|
||||
- opened
|
||||
- synchronize
|
||||
- labeled
|
||||
schedule:
|
||||
# Nightly tests @ 1AM after each work day
|
||||
- cron: "0 1 * * MON-FRI"
|
||||
|
||||
jobs:
|
||||
should-run:
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
pull-requests: write
|
||||
outputs:
|
||||
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Check for file changes
|
||||
id: changed-files
|
||||
uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
|
||||
with:
|
||||
since_last_remote_commit: true
|
||||
files_yaml: |
|
||||
gpu:
|
||||
- tfhe/Cargo.toml
|
||||
- tfhe/build.rs
|
||||
- backends/tfhe-cuda-backend/**
|
||||
- tfhe/src/core_crypto/gpu/**
|
||||
- tfhe/src/integer/gpu/**
|
||||
- tfhe/shortint/parameters/**
|
||||
- tfhe/src/high_level_api/**
|
||||
- tfhe/src/c_api/**
|
||||
- 'tfhe/docs/**.md'
|
||||
- '.github/workflows/gpu_signed_integer_tests.yml'
|
||||
- Makefile
|
||||
- scripts/**
|
||||
- ci/**
|
||||
|
||||
setup-instance:
|
||||
name: Setup instance (cuda-signed-integer-tests)
|
||||
runs-on: ubuntu-latest
|
||||
needs: should-run
|
||||
if: (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
|
||||
github.event_name == 'workflow_dispatch' ||
|
||||
(github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true')
|
||||
outputs:
|
||||
runner-name: ${{ steps.start-instance.outputs.label }}
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
backend: aws
|
||||
backend: hyperstack
|
||||
profile: gpu-test
|
||||
|
||||
cuda-signed-integer-tests:
|
||||
name: CUDA signed integer tests
|
||||
needs: setup-instance
|
||||
needs: [ should-run, setup-instance ]
|
||||
if: github.event_name != 'pull_request' ||
|
||||
(github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}_${{ github.ref }}
|
||||
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
|
||||
@@ -49,20 +98,34 @@ jobs:
|
||||
include:
|
||||
- os: ubuntu-22.04
|
||||
cuda: "12.2"
|
||||
gcc: 9
|
||||
gcc: 11
|
||||
env:
|
||||
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
|
||||
|
||||
CMAKE_VERSION: 3.29.6
|
||||
steps:
|
||||
# Mandatory on hyperstack since a bootable volume is not re-usable yet.
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
sudo apt update
|
||||
sudo apt install -y checkinstall zlib1g-dev libssl-dev
|
||||
wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
|
||||
tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
|
||||
cd cmake-${{ env.CMAKE_VERSION }}
|
||||
./bootstrap
|
||||
make -j"$(nproc)"
|
||||
sudo make install
|
||||
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
with:
|
||||
persist-credentials: 'false'
|
||||
|
||||
- name: Set up home
|
||||
run: |
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
@@ -85,21 +148,34 @@ jobs:
|
||||
echo "HOME=/home/ubuntu";
|
||||
} >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Run signed integer tests
|
||||
- name: Should run nightly tests
|
||||
if: github.event_name == 'schedule'
|
||||
run: |
|
||||
make test_signed_integer_gpu_ci
|
||||
{
|
||||
echo "FAST_TESTS=FALSE";
|
||||
echo "NIGHTLY_TESTS=TRUE";
|
||||
} >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Check device is detected
|
||||
if: ${{ !cancelled() }}
|
||||
run: nvidia-smi
|
||||
|
||||
- name: Run signed integer multi-bit tests
|
||||
run: |
|
||||
make test_signed_integer_multi_bit_gpu_ci
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ always() }}
|
||||
continue-on-error: true
|
||||
slack-notify:
|
||||
name: Slack Notification
|
||||
needs: [ setup-instance, cuda-signed-integer-tests ]
|
||||
runs-on: ubuntu-latest
|
||||
if: ${{ always() && needs.cuda-signed-integer-tests.result != 'skipped' }}
|
||||
continue-on-error: true
|
||||
steps:
|
||||
- name: Send message
|
||||
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "CUDA AWS signed integer tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
SLACK_COLOR: ${{ needs.cuda-signed-integer-tests.result }}
|
||||
SLACK_MESSAGE: "Base GPU tests finished with status: ${{ needs.cuda-signed-integer-tests.result }}. (${{ env.ACTION_RUN_URL }})"
|
||||
|
||||
teardown-instance:
|
||||
name: Teardown instance (cuda-tests)
|
||||
@@ -109,7 +185,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
188
.github/workflows/gpu_unsigned_integer_h100_tests.yml
vendored
Normal file
188
.github/workflows/gpu_unsigned_integer_h100_tests.yml
vendored
Normal file
@@ -0,0 +1,188 @@
|
||||
# Test unsigned integers on an H100 VM on hyperstack
|
||||
name: TFHE Cuda Backend - Unsigned integer tests on H100
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUSTFLAGS: "-C target-cpu=native"
|
||||
RUST_BACKTRACE: "full"
|
||||
RUST_MIN_STACK: "8388608"
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}
|
||||
|
||||
on:
|
||||
# Allows you to run this workflow manually from the Actions tab as an alternative.
|
||||
workflow_dispatch:
|
||||
pull_request:
|
||||
types: [ labeled ]
|
||||
|
||||
jobs:
|
||||
should-run:
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
pull-requests: write
|
||||
outputs:
|
||||
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Check for file changes
|
||||
id: changed-files
|
||||
uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
|
||||
with:
|
||||
since_last_remote_commit: true
|
||||
files_yaml: |
|
||||
gpu:
|
||||
- tfhe/Cargo.toml
|
||||
- tfhe/build.rs
|
||||
- backends/tfhe-cuda-backend/**
|
||||
- tfhe/src/core_crypto/gpu/**
|
||||
- tfhe/src/integer/gpu/**
|
||||
- tfhe/shortint/parameters/**
|
||||
- tfhe/src/high_level_api/**
|
||||
- tfhe/src/c_api/**
|
||||
- 'tfhe/docs/**.md'
|
||||
- Makefile
|
||||
- '.github/workflows/gpu_unsigned_integer_tests.yml'
|
||||
- scripts/**
|
||||
- ci/**
|
||||
|
||||
setup-instance:
|
||||
name: Setup instance (cuda-h100-tests)
|
||||
needs: should-run
|
||||
if: github.event_name != 'pull_request' ||
|
||||
(github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true') ||
|
||||
(github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
runner-name: ${{ steps.start-instance.outputs.label }}
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
backend: hyperstack
|
||||
profile: single-h100
|
||||
|
||||
cuda-tests-linux:
|
||||
name: CUDA H100 unsigned integer tests
|
||||
needs: [ should-run, setup-instance ]
|
||||
if: github.event_name != 'pull_request' ||
|
||||
(github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}_${{ github.ref }}
|
||||
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
|
||||
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
|
||||
strategy:
|
||||
fail-fast: false
|
||||
# explicit include-based build matrix, of known valid options
|
||||
matrix:
|
||||
include:
|
||||
- os: ubuntu-22.04
|
||||
cuda: "12.2"
|
||||
gcc: 11
|
||||
env:
|
||||
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
|
||||
CMAKE_VERSION: 3.29.6
|
||||
steps:
|
||||
# Mandatory on hyperstack since a bootable volume is not re-usable yet.
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
sudo apt update
|
||||
sudo apt install -y checkinstall zlib1g-dev libssl-dev
|
||||
wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
|
||||
tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
|
||||
cd cmake-${{ env.CMAKE_VERSION }}
|
||||
./bootstrap
|
||||
make -j"$(nproc)"
|
||||
sudo make install
|
||||
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
|
||||
- name: Set up home
|
||||
run: |
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
- name: Export CUDA variables
|
||||
if: ${{ !cancelled() }}
|
||||
run: |
|
||||
echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
|
||||
echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
|
||||
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
|
||||
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
|
||||
|
||||
# Specify the correct host compilers
|
||||
- name: Export gcc and g++ variables
|
||||
if: ${{ !cancelled() }}
|
||||
run: |
|
||||
{
|
||||
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
|
||||
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
|
||||
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
|
||||
echo "HOME=/home/ubuntu";
|
||||
} >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Check device is detected
|
||||
if: ${{ !cancelled() }}
|
||||
run: nvidia-smi
|
||||
|
||||
- name: Run unsigned integer tests
|
||||
run: |
|
||||
BIG_TESTS_INSTANCE=TRUE make test_unsigned_integer_gpu_ci
|
||||
|
||||
- name: Run unsigned integer multi-bit tests
|
||||
run: |
|
||||
BIG_TESTS_INSTANCE=TRUE make test_unsigned_integer_multi_bit_gpu_ci
|
||||
|
||||
slack-notify:
|
||||
name: Slack Notification
|
||||
needs: [ setup-instance, cuda-tests-linux ]
|
||||
runs-on: ubuntu-latest
|
||||
if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' }}
|
||||
continue-on-error: true
|
||||
steps:
|
||||
- name: Send message
|
||||
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
|
||||
env:
|
||||
SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}
|
||||
SLACK_MESSAGE: "Unsigned integer GPU H100 tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.ACTION_RUN_URL }})"
|
||||
|
||||
teardown-instance:
|
||||
name: Teardown instance (cuda-h100-tests)
|
||||
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
|
||||
needs: [ setup-instance, cuda-tests-linux ]
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
label: ${{ needs.setup-instance.outputs.runner-name }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "Instance teardown (cuda-h100-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
@@ -11,33 +11,81 @@ env:
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
FAST_TESTS: TRUE
|
||||
NIGHTLY_TESTS: FALSE
|
||||
|
||||
on:
|
||||
# Allows you to run this workflow manually from the Actions tab as an alternative.
|
||||
workflow_dispatch:
|
||||
pull_request:
|
||||
types:
|
||||
- opened
|
||||
- synchronize
|
||||
- labeled
|
||||
schedule:
|
||||
# Nightly tests @ 1AM after each work day
|
||||
- cron: "0 1 * * MON-FRI"
|
||||
|
||||
jobs:
|
||||
should-run:
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
pull-requests: write
|
||||
outputs:
|
||||
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Check for file changes
|
||||
id: changed-files
|
||||
uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
|
||||
with:
|
||||
since_last_remote_commit: true
|
||||
files_yaml: |
|
||||
gpu:
|
||||
- tfhe/Cargo.toml
|
||||
- tfhe/build.rs
|
||||
- backends/tfhe-cuda-backend/**
|
||||
- tfhe/src/core_crypto/gpu/**
|
||||
- tfhe/src/integer/gpu/**
|
||||
- tfhe/shortint/parameters/**
|
||||
- tfhe/src/high_level_api/**
|
||||
- tfhe/src/c_api/**
|
||||
- 'tfhe/docs/**.md'
|
||||
- '.github/workflows/gpu_unsigned_integer_tests.yml'
|
||||
- Makefile
|
||||
- scripts/**
|
||||
- ci/**
|
||||
|
||||
setup-instance:
|
||||
name: Setup instance (cuda-unsigned-integer-tests)
|
||||
needs: should-run
|
||||
if: (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
|
||||
github.event_name == 'workflow_dispatch' ||
|
||||
(github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true')
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
runner-name: ${{ steps.start-instance.outputs.label }}
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
backend: aws
|
||||
backend: hyperstack
|
||||
profile: gpu-test
|
||||
|
||||
cuda-unsigned-integer-tests:
|
||||
name: CUDA unsigned integer tests
|
||||
needs: setup-instance
|
||||
needs: [ should-run, setup-instance ]
|
||||
if: github.event_name != 'pull_request' ||
|
||||
(github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}_${{ github.ref }}
|
||||
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
|
||||
@@ -49,20 +97,32 @@ jobs:
|
||||
include:
|
||||
- os: ubuntu-22.04
|
||||
cuda: "12.2"
|
||||
gcc: 9
|
||||
gcc: 11
|
||||
env:
|
||||
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
|
||||
|
||||
CMAKE_VERSION: 3.29.6
|
||||
steps:
|
||||
# Mandatory on hyperstack since a bootable volume is not re-usable yet.
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
sudo apt update
|
||||
sudo apt install -y checkinstall zlib1g-dev libssl-dev
|
||||
wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
|
||||
tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
|
||||
cd cmake-${{ env.CMAKE_VERSION }}
|
||||
./bootstrap
|
||||
make -j"$(nproc)"
|
||||
sudo make install
|
||||
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
|
||||
- name: Set up home
|
||||
run: |
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
@@ -85,21 +145,34 @@ jobs:
|
||||
echo "HOME=/home/ubuntu";
|
||||
} >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Run unsigned integer tests
|
||||
- name: Should run nightly tests
|
||||
if: github.event_name == 'schedule'
|
||||
run: |
|
||||
make test_unsigned_integer_gpu_ci
|
||||
{
|
||||
echo "FAST_TESTS=FALSE";
|
||||
echo "NIGHTLY_TESTS=TRUE";
|
||||
} >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Check device is detected
|
||||
if: ${{ !cancelled() }}
|
||||
run: nvidia-smi
|
||||
|
||||
- name: Run unsigned integer multi-bit tests
|
||||
run: |
|
||||
make test_unsigned_integer_multi_bit_gpu_ci
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ always() }}
|
||||
continue-on-error: true
|
||||
slack-notify:
|
||||
name: Slack Notification
|
||||
needs: [ setup-instance, cuda-unsigned-integer-tests ]
|
||||
runs-on: ubuntu-latest
|
||||
if: ${{ always() && needs.cuda-unsigned-integer-tests.result != 'skipped' }}
|
||||
continue-on-error: true
|
||||
steps:
|
||||
- name: Send message
|
||||
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "CUDA AWS unsigned integer tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
SLACK_COLOR: ${{ needs.cuda-unsigned-integer-tests.result }}
|
||||
SLACK_MESSAGE: "Unsigned integer GPU tests finished with status: ${{ needs.cuda-unsigned-integer-tests.result }}. (${{ env.ACTION_RUN_URL }})"
|
||||
|
||||
teardown-instance:
|
||||
name: Teardown instance (cuda-tests)
|
||||
@@ -109,7 +182,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
130
.github/workflows/integer_benchmark.yml
vendored
130
.github/workflows/integer_benchmark.yml
vendored
@@ -1,130 +0,0 @@
|
||||
# Run integer benchmarks on an AWS instance and return parsed results to Slab CI bot.
|
||||
name: Integer benchmarks
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
instance_id:
|
||||
description: "Instance ID"
|
||||
type: string
|
||||
instance_image_id:
|
||||
description: "Instance AMI ID"
|
||||
type: string
|
||||
instance_type:
|
||||
description: "Instance product type"
|
||||
type: string
|
||||
runner_name:
|
||||
description: "Action runner name"
|
||||
type: string
|
||||
request_id:
|
||||
description: "Slab request ID"
|
||||
type: string
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
|
||||
PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUST_BACKTRACE: "full"
|
||||
RUST_MIN_STACK: "8388608"
|
||||
|
||||
jobs:
|
||||
run-integer-benchmarks:
|
||||
name: Execute integer benchmarks in EC2
|
||||
runs-on: ${{ github.event.inputs.runner_name }}
|
||||
if: ${{ !cancelled() }}
|
||||
steps:
|
||||
- name: Instance configuration used
|
||||
run: |
|
||||
echo "IDs: ${{ inputs.instance_id }}"
|
||||
echo "AMI: ${{ inputs.instance_image_id }}"
|
||||
echo "Type: ${{ inputs.instance_type }}"
|
||||
echo "Request ID: ${{ inputs.request_id }}"
|
||||
|
||||
- name: Get benchmark date
|
||||
run: |
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Set up home
|
||||
# The "Install rust" step requires the root user to have a HOME directory, which is not set.
|
||||
run: |
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
- name: Run benchmarks with AVX512
|
||||
run: |
|
||||
make FAST_BENCH=TRUE bench_integer
|
||||
|
||||
- name: Parse benchmarks to csv
|
||||
run: |
|
||||
make PARSE_INTEGER_BENCH_CSV_FILE=${{ env.PARSE_INTEGER_BENCH_CSV_FILE }} \
|
||||
parse_integer_benches
|
||||
|
||||
- name: Upload csv results artifact
|
||||
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
|
||||
with:
|
||||
name: ${{ github.sha }}_csv_integer
|
||||
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
|
||||
|
||||
- name: Parse results
|
||||
run: |
|
||||
COMMIT_DATE="$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})"
|
||||
COMMIT_HASH="$(git describe --tags --dirty)"
|
||||
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
|
||||
--database tfhe_rs \
|
||||
--hardware ${{ inputs.instance_type }} \
|
||||
--project-version "${COMMIT_HASH}" \
|
||||
--branch ${{ github.ref_name }} \
|
||||
--commit-date "${COMMIT_DATE}" \
|
||||
--bench-date "${{ env.BENCH_DATE }}" \
|
||||
--walk-subdirs \
|
||||
--name-suffix avx512 \
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
|
||||
with:
|
||||
name: ${{ github.sha }}_integer
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Send data to Slab
|
||||
shell: bash
|
||||
run: |
|
||||
echo "Computing HMac on results file"
|
||||
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
|
||||
echo "Sending results to Slab..."
|
||||
curl -v -k \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "X-Slab-Repository: ${{ github.repository }}" \
|
||||
-H "X-Slab-Command: store_data_v2" \
|
||||
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
|
||||
-d @${{ env.RESULTS_FILENAME }} \
|
||||
${{ secrets.SLAB_URL }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_MESSAGE: "Integer benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
@@ -1,28 +1,20 @@
|
||||
# Run all integer benchmarks on an AWS instance and return parsed results to Slab CI bot.
|
||||
name: Integer full benchmarks
|
||||
name: Integer benchmarks
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
instance_id:
|
||||
description: "Instance ID"
|
||||
type: string
|
||||
instance_image_id:
|
||||
description: "Instance AMI ID"
|
||||
type: string
|
||||
instance_type:
|
||||
description: "Instance product type"
|
||||
type: string
|
||||
runner_name:
|
||||
description: "Action runner name"
|
||||
type: string
|
||||
request_id:
|
||||
description: "Slab request ID"
|
||||
type: string
|
||||
user_inputs:
|
||||
description: "Type of benchmarks to run"
|
||||
type: string
|
||||
default: "weekly_benchmarks"
|
||||
all_precisions:
|
||||
description: "Run all precisions"
|
||||
type: boolean
|
||||
default: false
|
||||
|
||||
schedule:
|
||||
# Weekly benchmarks will be triggered each Saturday at 1a.m.
|
||||
- cron: '0 1 * * 6'
|
||||
# Quarterly benchmarks will be triggered right before the end of the quarter, on the 25th of the current month at 4 a.m.
|
||||
# These benchmarks take far longer to execute, which is why they are run only four times a year.
|
||||
- cron: '0 4 25 MAR,JUN,SEP,DEC *'
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
@@ -30,21 +22,29 @@ env:
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUST_BACKTRACE: "full"
|
||||
RUST_MIN_STACK: "8388608"
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
FAST_BENCH: TRUE
|
||||
|
||||
jobs:
|
||||
prepare-matrix:
|
||||
name: Prepare operations matrix
|
||||
runs-on: ubuntu-latest
|
||||
if: github.event_name != 'schedule' ||
|
||||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
|
||||
outputs:
|
||||
op_flavor: ${{ steps.set_op_flavor.outputs.op_flavor }}
|
||||
steps:
|
||||
- name: Weekly benchmarks
|
||||
if: ${{ github.event.inputs.user_inputs == 'weekly_benchmarks' }}
|
||||
if: github.event_name == 'workflow_dispatch' ||
|
||||
github.event.schedule == '0 1 * * 6'
|
||||
run: |
|
||||
echo "OP_FLAVOR=[\"default\"]" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Quarterly benchmarks
|
||||
if: ${{ github.event.inputs.user_inputs == 'quarterly_benchmarks' }}
|
||||
if: github.event.schedule == '0 4 25 MAR,JUN,SEP,DEC *'
|
||||
run: |
|
||||
echo "OP_FLAVOR=[\"default\", \"smart\", \"unchecked\", \"misc\"]" >> "${GITHUB_ENV}"
|
||||
|
||||
@@ -53,11 +53,31 @@ jobs:
|
||||
run: |
|
||||
echo "op_flavor=${{ toJSON(env.OP_FLAVOR) }}" >> "${GITHUB_OUTPUT}"
|
||||
|
||||
setup-instance:
|
||||
name: Setup instance (integer-benchmarks)
|
||||
needs: prepare-matrix
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
runner-name: ${{ steps.start-instance.outputs.label }}
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
backend: aws
|
||||
profile: bench
|
||||
|
||||
integer-benchmarks:
|
||||
name: Execute integer benchmarks for all operations flavor
|
||||
needs: prepare-matrix
|
||||
runs-on: ${{ github.event.inputs.runner_name }}
|
||||
if: ${{ !cancelled() }}
|
||||
needs: [ prepare-matrix, setup-instance ]
|
||||
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}_${{ github.ref }}
|
||||
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
|
||||
continue-on-error: true
|
||||
timeout-minutes: 1440 # 24 hours
|
||||
strategy:
|
||||
@@ -66,13 +86,6 @@ jobs:
|
||||
command: [ integer, integer_multi_bit]
|
||||
op_flavor: ${{ fromJson(needs.prepare-matrix.outputs.op_flavor) }}
|
||||
steps:
|
||||
- name: Instance configuration used
|
||||
run: |
|
||||
echo "IDs: ${{ inputs.instance_id }}"
|
||||
echo "AMI: ${{ inputs.instance_image_id }}"
|
||||
echo "Type: ${{ inputs.instance_type }}"
|
||||
echo "Request ID: ${{ inputs.request_id }}"
|
||||
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
with:
|
||||
@@ -92,7 +105,7 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
@@ -103,6 +116,11 @@ jobs:
|
||||
path: slab
|
||||
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Should run benchmarks with all precisions
|
||||
if: inputs.all_precisions
|
||||
run: |
|
||||
echo "FAST_BENCH=FALSE" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Run benchmarks with AVX512
|
||||
run: |
|
||||
make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_${{ matrix.command }}
|
||||
@@ -111,7 +129,7 @@ jobs:
|
||||
run: |
|
||||
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
|
||||
--database tfhe_rs \
|
||||
--hardware ${{ inputs.instance_type }} \
|
||||
--hardware "hpc7a.96xlarge" \
|
||||
--project-version "${{ env.COMMIT_HASH }}" \
|
||||
--branch ${{ github.ref_name }} \
|
||||
--commit-date "${{ env.COMMIT_DATE }}" \
|
||||
@@ -121,7 +139,7 @@ jobs:
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
|
||||
uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
|
||||
with:
|
||||
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
@@ -140,19 +158,34 @@ jobs:
|
||||
-d @${{ env.RESULTS_FILENAME }} \
|
||||
${{ secrets.SLAB_URL }}
|
||||
|
||||
slack-notification:
|
||||
name: Slack Notification
|
||||
runs-on: ${{ github.event.inputs.runner_name }}
|
||||
if: ${{ failure() }}
|
||||
needs: integer-benchmarks
|
||||
steps:
|
||||
- name: Notify
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_MESSAGE: "Integer full benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
|
||||
teardown-instance:
|
||||
name: Teardown instance (integer-benchmarks)
|
||||
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
|
||||
needs: [ setup-instance, integer-benchmarks ]
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
label: ${{ needs.setup-instance.outputs.runner-name }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "Instance teardown (integer-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
20
.github/workflows/integer_gpu_benchmark.yml
vendored
20
.github/workflows/integer_gpu_benchmark.yml
vendored
@@ -23,14 +23,14 @@ jobs:
|
||||
setup-instance:
|
||||
name: Setup instance (cuda-integer-benchmarks)
|
||||
runs-on: ubuntu-latest
|
||||
if: github.event_name != 'push' ||
|
||||
if: github.event_name == 'workflow_dispatch' ||
|
||||
(github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs')
|
||||
outputs:
|
||||
runner-name: ${{ steps.start-instance.outputs.label }}
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -53,7 +53,7 @@ jobs:
|
||||
gcc: 11
|
||||
env:
|
||||
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
|
||||
CMAKE_VERSION: 3.29.1
|
||||
CMAKE_VERSION: 3.29.6
|
||||
steps:
|
||||
# Mandatory on hyperstack since a bootable volume is not re-usable yet.
|
||||
- name: Install dependencies
|
||||
@@ -86,7 +86,7 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
@@ -110,6 +110,10 @@ jobs:
|
||||
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
|
||||
} >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Check device is detected
|
||||
if: ${{ !cancelled() }}
|
||||
run: nvidia-smi
|
||||
|
||||
- name: Run benchmarks with AVX512
|
||||
run: |
|
||||
make FAST_BENCH=TRUE BENCH_OP_FLAVOR=default bench_integer_gpu
|
||||
@@ -120,7 +124,7 @@ jobs:
|
||||
parse_integer_benches
|
||||
|
||||
- name: Upload csv results artifact
|
||||
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
|
||||
uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
|
||||
with:
|
||||
name: ${{ github.sha }}_csv_integer
|
||||
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
|
||||
@@ -140,7 +144,7 @@ jobs:
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
|
||||
uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
|
||||
with:
|
||||
name: ${{ github.sha }}_integer
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
@@ -170,7 +174,7 @@ jobs:
|
||||
name: Slack Notification
|
||||
needs: [ setup-instance, cuda-integer-benchmarks ]
|
||||
runs-on: ubuntu-latest
|
||||
if: ${{ !success() && !cancelled() }}
|
||||
if: ${{ always() && needs.cuda-integer-benchmarks.result != 'skipped' && failure() }}
|
||||
continue-on-error: true
|
||||
steps:
|
||||
- name: Send message
|
||||
@@ -187,7 +191,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
14
.github/workflows/integer_gpu_full_benchmark.yml
vendored
14
.github/workflows/integer_gpu_full_benchmark.yml
vendored
@@ -29,7 +29,7 @@ jobs:
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -57,7 +57,7 @@ jobs:
|
||||
gcc: 11
|
||||
env:
|
||||
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
|
||||
CMAKE_VERSION: 3.29.1
|
||||
CMAKE_VERSION: 3.29.6
|
||||
steps:
|
||||
# Mandatory on hyperstack since a bootable volume is not re-usable yet.
|
||||
- name: Install dependencies
|
||||
@@ -90,7 +90,7 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
@@ -121,6 +121,10 @@ jobs:
|
||||
path: slab
|
||||
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Check device is detected
|
||||
if: ${{ !cancelled() }}
|
||||
run: nvidia-smi
|
||||
|
||||
- name: Run benchmarks with AVX512
|
||||
run: |
|
||||
make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_${{ matrix.command }}_gpu
|
||||
@@ -140,7 +144,7 @@ jobs:
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
|
||||
uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
|
||||
with:
|
||||
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
@@ -180,7 +184,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
130
.github/workflows/integer_multi_bit_benchmark.yml
vendored
130
.github/workflows/integer_multi_bit_benchmark.yml
vendored
@@ -1,130 +0,0 @@
|
||||
# Run integer benchmarks with multi-bit cryptographic parameters on an AWS instance and return parsed results to Slab CI bot.
|
||||
name: Integer Multi-bit benchmarks
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
instance_id:
|
||||
description: "Instance ID"
|
||||
type: string
|
||||
instance_image_id:
|
||||
description: "Instance AMI ID"
|
||||
type: string
|
||||
instance_type:
|
||||
description: "Instance product type"
|
||||
type: string
|
||||
runner_name:
|
||||
description: "Action runner name"
|
||||
type: string
|
||||
request_id:
|
||||
description: "Slab request ID"
|
||||
type: string
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
|
||||
PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUST_BACKTRACE: "full"
|
||||
RUST_MIN_STACK: "8388608"
|
||||
|
||||
jobs:
|
||||
run-integer-benchmarks:
|
||||
name: Execute integer multi-bit benchmarks in EC2
|
||||
runs-on: ${{ github.event.inputs.runner_name }}
|
||||
if: ${{ !cancelled() }}
|
||||
steps:
|
||||
- name: Instance configuration used
|
||||
run: |
|
||||
echo "IDs: ${{ inputs.instance_id }}"
|
||||
echo "AMI: ${{ inputs.instance_image_id }}"
|
||||
echo "Type: ${{ inputs.instance_type }}"
|
||||
echo "Request ID: ${{ inputs.request_id }}"
|
||||
|
||||
- name: Get benchmark date
|
||||
run: |
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Set up home
|
||||
# "Install rust" step requires root user to have a HOME directory, which is not set.
|
||||
run: |
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
- name: Run multi-bit benchmarks with AVX512
|
||||
run: |
|
||||
make FAST_BENCH=TRUE bench_integer_multi_bit
|
||||
|
||||
- name: Parse benchmarks to csv
|
||||
run: |
|
||||
make PARSE_INTEGER_BENCH_CSV_FILE=${{ env.PARSE_INTEGER_BENCH_CSV_FILE }} \
|
||||
parse_integer_benches
|
||||
|
||||
- name: Upload csv results artifact
|
||||
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
|
||||
with:
|
||||
name: ${{ github.sha }}_csv_integer
|
||||
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
|
||||
|
||||
- name: Parse results
|
||||
run: |
|
||||
COMMIT_DATE="$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})"
|
||||
COMMIT_HASH="$(git describe --tags --dirty)"
|
||||
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
|
||||
--database tfhe_rs \
|
||||
--hardware ${{ inputs.instance_type }} \
|
||||
--project-version "${COMMIT_HASH}" \
|
||||
--branch ${{ github.ref_name }} \
|
||||
--commit-date "${COMMIT_DATE}" \
|
||||
--bench-date "${{ env.BENCH_DATE }}" \
|
||||
--walk-subdirs \
|
||||
--name-suffix avx512 \
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
|
||||
with:
|
||||
name: ${{ github.sha }}_integer
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Send data to Slab
|
||||
shell: bash
|
||||
run: |
|
||||
echo "Computing HMac on results file"
|
||||
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
|
||||
echo "Sending results to Slab..."
|
||||
curl -v -k \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "X-Slab-Repository: ${{ github.repository }}" \
|
||||
-H "X-Slab-Command: store_data_v2" \
|
||||
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
|
||||
-d @${{ env.RESULTS_FILENAME }} \
|
||||
${{ secrets.SLAB_URL }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_MESSAGE: "Integer benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
@@ -4,10 +4,14 @@ name: Integer GPU Multi-bit benchmarks
|
||||
on:
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
full_benchmark:
|
||||
all_precisions:
|
||||
description: "Run all precisions"
|
||||
type: boolean
|
||||
default: false
|
||||
fast_default:
|
||||
description: "Run only deduplicated default operations without scalar variants"
|
||||
type: boolean
|
||||
default: false
|
||||
|
||||
schedule:
|
||||
# Weekly benchmarks will be triggered each Saturday at 1a.m.
|
||||
@@ -25,6 +29,7 @@ env:
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
FAST_BENCH: TRUE
|
||||
BENCH_OP_FLAVOR: default
|
||||
|
||||
jobs:
|
||||
setup-instance:
|
||||
@@ -37,7 +42,7 @@ jobs:
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -61,7 +66,7 @@ jobs:
|
||||
gcc: 11
|
||||
env:
|
||||
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
|
||||
CMAKE_VERSION: 3.29.1
|
||||
CMAKE_VERSION: 3.29.6
|
||||
steps:
|
||||
# Mandatory on hyperstack since a bootable volume is not re-usable yet.
|
||||
- name: Install dependencies
|
||||
@@ -94,7 +99,7 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
@@ -118,14 +123,23 @@ jobs:
|
||||
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
|
||||
} >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Should run full benchmarks
|
||||
if: inputs.full_benchmark
|
||||
- name: Should run benchmarks with all precisions
|
||||
if: inputs.all_precisions
|
||||
run: |
|
||||
echo "FAST_BENCH=FALSE" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Should run fast subset benchmarks
|
||||
if: inputs.fast_default
|
||||
run: |
|
||||
echo "BENCH_OP_FLAVOR=fast_default" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Check device is detected
|
||||
if: ${{ !cancelled() }}
|
||||
run: nvidia-smi
|
||||
|
||||
- name: Run multi-bit benchmarks with AVX512
|
||||
run: |
|
||||
make FAST_BENCH=${{ env.FAST_BENCH }} BENCH_OP_FLAVOR=default bench_unsigned_integer_multi_bit_gpu
|
||||
make bench_unsigned_integer_multi_bit_gpu
|
||||
|
||||
- name: Parse benchmarks to csv
|
||||
run: |
|
||||
@@ -133,7 +147,7 @@ jobs:
|
||||
parse_integer_benches
|
||||
|
||||
- name: Upload csv results artifact
|
||||
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
|
||||
uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
|
||||
with:
|
||||
name: ${{ github.sha }}_csv_integer
|
||||
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
|
||||
@@ -153,7 +167,7 @@ jobs:
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
|
||||
uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
|
||||
with:
|
||||
name: ${{ github.sha }}_integer
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
@@ -201,7 +215,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
@@ -4,10 +4,14 @@ name: Integer multi GPU Multi-bit benchmarks
|
||||
on:
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
full_benchmark:
|
||||
all_precisions:
|
||||
description: "Run all precisions"
|
||||
type: boolean
|
||||
default: false
|
||||
fast_default:
|
||||
description: "Run only deduplicated default operations without scalar variants"
|
||||
type: boolean
|
||||
default: false
|
||||
|
||||
schedule:
|
||||
# Weekly benchmarks will be triggered each Saturday at 1a.m.
|
||||
@@ -24,25 +28,28 @@ env:
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
FAST_BENCH: TRUE
|
||||
BENCH_OP_FLAVOR: default
|
||||
|
||||
jobs:
|
||||
setup-instance:
|
||||
name: Setup instance (cuda-integer-multi-bit-multi-gpu-benchmarks)
|
||||
runs-on: ubuntu-latest
|
||||
if: ${{ (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs') || github.event_name == 'workflow_dispatch' }}
|
||||
if: ${{ (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs') ||
|
||||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
|
||||
github.event_name == 'workflow_dispatch' }}
|
||||
outputs:
|
||||
runner-name: ${{ steps.start-instance.outputs.label }}
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
backend: aws
|
||||
profile: multi-gpu-test
|
||||
backend: hyperstack
|
||||
profile: multi-h100
|
||||
|
||||
cuda-integer-multi-bit-multi-gpu-benchmarks:
|
||||
name: Execute multi GPU integer multi-bit benchmarks
|
||||
@@ -57,11 +64,23 @@ jobs:
|
||||
include:
|
||||
- os: ubuntu-22.04
|
||||
cuda: "12.2"
|
||||
gcc: 9
|
||||
gcc: 11
|
||||
env:
|
||||
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
|
||||
|
||||
CMAKE_VERSION: 3.29.6
|
||||
steps:
|
||||
# Mandatory on hyperstack since a bootable volume is not re-usable yet.
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
sudo apt update
|
||||
sudo apt install -y checkinstall zlib1g-dev libssl-dev
|
||||
wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
|
||||
tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
|
||||
cd cmake-${{ env.CMAKE_VERSION }}
|
||||
./bootstrap
|
||||
make -j"$(nproc)"
|
||||
sudo make install
|
||||
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
with:
|
||||
@@ -81,7 +100,7 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
@@ -112,20 +131,29 @@ jobs:
|
||||
path: slab
|
||||
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Should run full benchmarks
|
||||
if: inputs.full_benchmark
|
||||
- name: Should run benchmarks with all precisions
|
||||
if: inputs.all_precisions
|
||||
run: |
|
||||
echo "FAST_BENCH=FALSE" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Should run fast subset benchmarks
|
||||
if: inputs.fast_default
|
||||
run: |
|
||||
echo "BENCH_OP_FLAVOR=fast_default" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Check device is detected
|
||||
if: ${{ !cancelled() }}
|
||||
run: nvidia-smi
|
||||
|
||||
- name: Run multi-bit benchmarks with AVX512
|
||||
run: |
|
||||
make FAST_BENCH=${{ env.FAST_BENCH }} BENCH_OP_FLAVOR=default bench_unsigned_integer_multi_bit_gpu
|
||||
make bench_unsigned_integer_multi_bit_gpu
|
||||
|
||||
- name: Parse results
|
||||
run: |
|
||||
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
|
||||
--database tfhe_rs \
|
||||
--hardware "p3.8xlarge" \
|
||||
--hardware "n3-H100x8" \
|
||||
--backend gpu \
|
||||
--project-version "${{ env.COMMIT_HASH }}" \
|
||||
--branch ${{ github.ref_name }} \
|
||||
@@ -136,7 +164,7 @@ jobs:
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
|
||||
uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
|
||||
with:
|
||||
name: ${{ github.sha }}_integer
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
@@ -176,7 +204,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
@@ -29,14 +29,14 @@ jobs:
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
backend: aws
|
||||
profile: multi-gpu-test
|
||||
backend: hyperstack
|
||||
profile: multi-h100
|
||||
|
||||
cuda-integer-full-multi-gpu-benchmarks:
|
||||
name: Execute multi GPU integer benchmarks for all operations flavor
|
||||
@@ -54,11 +54,23 @@ jobs:
|
||||
include:
|
||||
- os: ubuntu-22.04
|
||||
cuda: "12.2"
|
||||
gcc: 9
|
||||
gcc: 11
|
||||
env:
|
||||
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
|
||||
|
||||
CMAKE_VERSION: 3.29.6
|
||||
steps:
|
||||
# Mandatory on hyperstack since a bootable volume is not re-usable yet.
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
sudo apt update
|
||||
sudo apt install -y checkinstall zlib1g-dev libssl-dev
|
||||
wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
|
||||
tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
|
||||
cd cmake-${{ env.CMAKE_VERSION }}
|
||||
./bootstrap
|
||||
make -j"$(nproc)"
|
||||
sudo make install
|
||||
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
with:
|
||||
@@ -78,7 +90,7 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
@@ -109,6 +121,10 @@ jobs:
|
||||
path: slab
|
||||
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Check device is detected
|
||||
if: ${{ !cancelled() }}
|
||||
run: nvidia-smi
|
||||
|
||||
- name: Run benchmarks with AVX512
|
||||
run: |
|
||||
make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_${{ matrix.command }}_gpu
|
||||
@@ -117,7 +133,7 @@ jobs:
|
||||
run: |
|
||||
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
|
||||
--database tfhe_rs \
|
||||
--hardware "p3.8xlarge" \
|
||||
--hardware "n3-H100x8" \
|
||||
--backend gpu \
|
||||
--project-version "${{ env.COMMIT_HASH }}" \
|
||||
--branch ${{ github.ref_name }} \
|
||||
@@ -128,7 +144,7 @@ jobs:
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
|
||||
uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
|
||||
with:
|
||||
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
@@ -168,7 +184,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
2
.github/workflows/m1_tests.yml
vendored
2
.github/workflows/m1_tests.yml
vendored
@@ -39,7 +39,7 @@ jobs:
|
||||
persist-credentials: 'false'
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
|
||||
66
.github/workflows/make_release.yml
vendored
66
.github/workflows/make_release.yml
vendored
@@ -30,20 +30,62 @@ env:
|
||||
NPM_TAG: ""
|
||||
|
||||
jobs:
|
||||
publish_release:
|
||||
name: Publish Release
|
||||
package:
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
hash: ${{ steps.hash.outputs.hash }}
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
with:
|
||||
fetch-depth: 0
|
||||
- name: Prepare package
|
||||
run: |
|
||||
cargo package -p tfhe
|
||||
- uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a # v4.3.6
|
||||
with:
|
||||
name: crate
|
||||
path: target/package/*.crate
|
||||
- name: generate hash
|
||||
id: hash
|
||||
run: cd target/package && echo "hash=$(sha256sum ./*.crate | base64 -w0)" >> "${GITHUB_OUTPUT}"
|
||||
|
||||
provenance:
|
||||
if: ${{ !inputs.dry_run }}
|
||||
needs: [package]
|
||||
uses: slsa-framework/slsa-github-generator/.github/workflows/generator_generic_slsa3.yml@v2.0.0
|
||||
permissions:
|
||||
# Needed to detect the GitHub Actions environment
|
||||
actions: read
|
||||
# Needed to create the provenance via GitHub OIDC
|
||||
id-token: write
|
||||
# Needed to upload assets/artifacts
|
||||
contents: write
|
||||
with:
|
||||
# SHA-256 hashes of the Crate package.
|
||||
base64-subjects: ${{ needs.package.outputs.hash }}
|
||||
|
||||
publish_release:
|
||||
name: Publish Release
|
||||
needs: [package] # for comparing hashes
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
contents: read
|
||||
id-token: write
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
with:
|
||||
fetch-depth: 0
|
||||
- name: Create NPM version tag
|
||||
if: ${{ inputs.npm_latest_tag }}
|
||||
run: |
|
||||
echo "NPM_TAG=latest" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Download artifact
|
||||
uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8
|
||||
with:
|
||||
name: crate
|
||||
path: target/package
|
||||
- name: Publish crate.io package
|
||||
if: ${{ inputs.push_to_crates }}
|
||||
env:
|
||||
@@ -52,6 +94,22 @@ jobs:
|
||||
run: |
|
||||
cargo publish -p tfhe --token ${{ env.CRATES_TOKEN }} ${{ env.DRY_RUN }}
|
||||
|
||||
- name: Generate hash
|
||||
id: published_hash
|
||||
run: cd target/package && echo "pub_hash=$(sha256sum ./*.crate | base64 -w0)" >> "${GITHUB_OUTPUT}"
|
||||
|
||||
- name: Slack notification (hashes comparison)
|
||||
if: ${{ needs.package.outputs.hash != steps.published_hash.outputs.pub_hash }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
|
||||
env:
|
||||
SLACK_COLOR: failure
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_MESSAGE: "SLSA tfhe crate - hash comparison failure: (${{ env.ACTION_RUN_URL }})"
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
|
||||
- name: Build web package
|
||||
if: ${{ inputs.push_web_package }}
|
||||
run: |
|
||||
@@ -65,6 +123,7 @@ jobs:
|
||||
package: tfhe/pkg/package.json
|
||||
dry-run: ${{ inputs.dry_run }}
|
||||
tag: ${{ env.NPM_TAG }}
|
||||
provenance: true
|
||||
|
||||
- name: Build Node package
|
||||
if: ${{ inputs.push_node_package }}
|
||||
@@ -82,6 +141,7 @@ jobs:
|
||||
package: tfhe/pkg/package.json
|
||||
dry-run: ${{ inputs.dry_run }}
|
||||
tag: ${{ env.NPM_TAG }}
|
||||
provenance: true
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() }}
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
# Publish new release of tfhe-rs on various platform.
|
||||
name: Publish concrete-csprng release
|
||||
|
||||
on:
|
||||
@@ -37,6 +36,6 @@ jobs:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_MESSAGE: "concrete-csprng release failed: (${{ env.ACTION_RUN_URL }})"
|
||||
SLACK_MESSAGE: "concrete-csprng release finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
|
||||
36
.github/workflows/make_release_concrete_tfhe_versionable.yml
vendored
Normal file
36
.github/workflows/make_release_concrete_tfhe_versionable.yml
vendored
Normal file
@@ -0,0 +1,36 @@
|
||||
name: Publish tfhe-versionable release
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
|
||||
env:
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
|
||||
jobs:
|
||||
publish_release:
|
||||
name: Publish tfhe-versionable Release
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Publish crate.io package
|
||||
env:
|
||||
CRATES_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
|
||||
run: |
|
||||
cargo publish -p tfhe-versionable-derive --token ${{ env.CRATES_TOKEN }}
|
||||
cargo publish -p tfhe-versionable --token ${{ env.CRATES_TOKEN }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_MESSAGE: "tfhe-versionable release finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
6
.github/workflows/make_release_cuda.yml
vendored
6
.github/workflows/make_release_cuda.yml
vendored
@@ -29,7 +29,7 @@ jobs:
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -63,7 +63,7 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
@@ -112,7 +112,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
4
.github/workflows/parameters_check.yml
vendored
4
.github/workflows/parameters_check.yml
vendored
@@ -14,7 +14,7 @@ on:
|
||||
|
||||
jobs:
|
||||
params-curves-security-check:
|
||||
runs-on: ubuntu-latest
|
||||
runs-on: large_ubuntu_16
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
@@ -24,7 +24,7 @@ jobs:
|
||||
with:
|
||||
repository: malb/lattice-estimator
|
||||
path: lattice_estimator
|
||||
ref: '53508253629d3b5d31a2ad110e85dc69391ccb95'
|
||||
ref: 'e80ec6bbbba212428b0e92d0467c18629cf9ed67'
|
||||
|
||||
- name: Install Sage
|
||||
run: |
|
||||
|
||||
128
.github/workflows/shortint_benchmark.yml
vendored
128
.github/workflows/shortint_benchmark.yml
vendored
@@ -1,128 +0,0 @@
|
||||
# Run shortint benchmarks on an AWS instance and return parsed results to Slab CI bot.
|
||||
name: Shortint benchmarks
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
instance_id:
|
||||
description: "Instance ID"
|
||||
type: string
|
||||
instance_image_id:
|
||||
description: "Instance AMI ID"
|
||||
type: string
|
||||
instance_type:
|
||||
description: "Instance product type"
|
||||
type: string
|
||||
runner_name:
|
||||
description: "Action runner name"
|
||||
type: string
|
||||
request_id:
|
||||
description: "Slab request ID"
|
||||
type: string
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUST_BACKTRACE: "full"
|
||||
RUST_MIN_STACK: "8388608"
|
||||
|
||||
jobs:
|
||||
run-shortint-benchmarks:
|
||||
name: Execute shortint benchmarks in EC2
|
||||
runs-on: ${{ github.event.inputs.runner_name }}
|
||||
if: ${{ !cancelled() }}
|
||||
steps:
|
||||
- name: Instance configuration used
|
||||
run: |
|
||||
echo "IDs: ${{ inputs.instance_id }}"
|
||||
echo "AMI: ${{ inputs.instance_image_id }}"
|
||||
echo "Type: ${{ inputs.instance_type }}"
|
||||
echo "Request ID: ${{ inputs.request_id }}"
|
||||
|
||||
- name: Get benchmark date
|
||||
run: |
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Set up home
|
||||
# "Install rust" step requires root user to have a HOME directory, which is not set.
|
||||
run: |
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
- name: Run benchmarks with AVX512
|
||||
run: |
|
||||
make bench_shortint
|
||||
|
||||
- name: Parse results
|
||||
run: |
|
||||
COMMIT_DATE="$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})"
|
||||
COMMIT_HASH="$(git describe --tags --dirty)"
|
||||
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
|
||||
--database tfhe_rs \
|
||||
--hardware ${{ inputs.instance_type }} \
|
||||
--project-version "${COMMIT_HASH}" \
|
||||
--branch ${{ github.ref_name }} \
|
||||
--commit-date "${COMMIT_DATE}" \
|
||||
--bench-date "${{ env.BENCH_DATE }}" \
|
||||
--walk-subdirs \
|
||||
--name-suffix avx512 \
|
||||
--throughput
|
||||
|
||||
- name: Measure key sizes
|
||||
run: |
|
||||
make measure_shortint_key_sizes
|
||||
|
||||
- name: Parse key sizes results
|
||||
run: |
|
||||
python3 ./ci/benchmark_parser.py tfhe/shortint_key_sizes.csv ${{ env.RESULTS_FILENAME }} \
|
||||
--key-sizes \
|
||||
--append-results
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
|
||||
with:
|
||||
name: ${{ github.sha }}_shortint
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Send data to Slab
|
||||
shell: bash
|
||||
run: |
|
||||
echo "Computing HMac on results file"
|
||||
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
|
||||
echo "Sending results to Slab..."
|
||||
curl -v -k \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "X-Slab-Repository: ${{ github.repository }}" \
|
||||
-H "X-Slab-Command: store_data_v2" \
|
||||
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
|
||||
-d @${{ env.RESULTS_FILENAME }} \
|
||||
${{ secrets.SLAB_URL }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_MESSAGE: "Shortint benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
@@ -3,30 +3,13 @@ name: Shortint full benchmarks
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
instance_id:
|
||||
description: "Instance ID"
|
||||
type: string
|
||||
instance_image_id:
|
||||
description: "Instance AMI ID"
|
||||
type: string
|
||||
instance_type:
|
||||
description: "Instance product type"
|
||||
type: string
|
||||
runner_name:
|
||||
description: "Action runner name"
|
||||
type: string
|
||||
request_id:
|
||||
description: "Slab request ID"
|
||||
type: string
|
||||
# This input is not used in this workflow but still mandatory since a calling workflow could
|
||||
# use it. If a triggering command includes a user_inputs field, then the triggered workflow
|
||||
# must include this very input, otherwise the workflow won't be called.
|
||||
# See start_full_benchmarks.yml as example.
|
||||
user_inputs:
|
||||
description: "Type of benchmarks to run"
|
||||
type: string
|
||||
default: "weekly_benchmarks"
|
||||
schedule:
|
||||
# Weekly benchmarks will be triggered each Saturday at 1a.m.
|
||||
- cron: '0 1 * * 6'
|
||||
# Quarterly benchmarks will be triggered right before end of quarter, the 25th of the current month at 4a.m.
|
||||
# These benchmarks take far longer to execute, hence the reason to run them only four times a year.
|
||||
- cron: '0 4 25 MAR,JUN,SEP,DEC *'
|
||||
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
@@ -34,24 +17,67 @@ env:
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUST_BACKTRACE: "full"
|
||||
RUST_MIN_STACK: "8388608"
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
|
||||
jobs:
|
||||
prepare-matrix:
|
||||
name: Prepare operations matrix
|
||||
runs-on: ubuntu-latest
|
||||
if: github.event_name != 'schedule' ||
|
||||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
|
||||
outputs:
|
||||
op_flavor: ${{ steps.set_op_flavor.outputs.op_flavor }}
|
||||
steps:
|
||||
- name: Weekly benchmarks
|
||||
if: github.event_name == 'workflow_dispatch' ||
|
||||
github.event.schedule == '0 1 * * 6'
|
||||
run: |
|
||||
echo "OP_FLAVOR=[\"default\"]" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Quarterly benchmarks
|
||||
if: github.event.schedule == '0 4 25 MAR,JUN,SEP,DEC *'
|
||||
run: |
|
||||
echo "OP_FLAVOR=[\"default\", \"smart\", \"unchecked\"]" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Set operation flavor output
|
||||
id: set_op_flavor
|
||||
run: |
|
||||
echo "op_flavor=${{ toJSON(env.OP_FLAVOR) }}" >> "${GITHUB_OUTPUT}"
|
||||
|
||||
setup-instance:
|
||||
name: Setup instance (shortint-benchmarks)
|
||||
needs: prepare-matrix
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
runner-name: ${{ steps.start-instance.outputs.label }}
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
backend: aws
|
||||
profile: bench
|
||||
|
||||
shortint-benchmarks:
|
||||
name: Execute shortint benchmarks for all operations flavor
|
||||
runs-on: ${{ github.event.inputs.runner_name }}
|
||||
if: ${{ !cancelled() }}
|
||||
needs: [ prepare-matrix, setup-instance ]
|
||||
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}_${{ github.ref }}
|
||||
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
|
||||
continue-on-error: true
|
||||
strategy:
|
||||
max-parallel: 1
|
||||
matrix:
|
||||
op_flavor: [ default, smart, unchecked ]
|
||||
op_flavor: ${{ fromJson(needs.prepare-matrix.outputs.op_flavor) }}
|
||||
steps:
|
||||
- name: Instance configuration used
|
||||
run: |
|
||||
echo "IDs: ${{ inputs.instance_id }}"
|
||||
echo "AMI: ${{ inputs.instance_image_id }}"
|
||||
echo "Type: ${{ inputs.instance_type }}"
|
||||
echo "Request ID: ${{ inputs.request_id }}"
|
||||
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
with:
|
||||
@@ -71,7 +97,7 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
@@ -92,7 +118,7 @@ jobs:
|
||||
COMMIT_HASH="$(git describe --tags --dirty)"
|
||||
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
|
||||
--database tfhe_rs \
|
||||
--hardware ${{ inputs.instance_type }} \
|
||||
--hardware "hpc7a.96xlarge" \
|
||||
--project-version "${COMMIT_HASH}" \
|
||||
--branch ${{ github.ref_name }} \
|
||||
--commit-date "${COMMIT_DATE}" \
|
||||
@@ -115,7 +141,7 @@ jobs:
|
||||
--append-results
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
|
||||
uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
|
||||
with:
|
||||
name: ${{ github.sha }}_shortint_${{ matrix.op_flavor }}
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
@@ -134,19 +160,34 @@ jobs:
|
||||
-d @${{ env.RESULTS_FILENAME }} \
|
||||
${{ secrets.SLAB_URL }}
|
||||
|
||||
slack-notification:
|
||||
name: Slack Notification
|
||||
runs-on: ${{ github.event.inputs.runner_name }}
|
||||
if: ${{ failure() }}
|
||||
needs: shortint-benchmarks
|
||||
steps:
|
||||
- name: Notify
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_MESSAGE: "Shortint full benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
|
||||
teardown-instance:
|
||||
name: Teardown instance (shortint-benchmarks)
|
||||
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
|
||||
needs: [ setup-instance, shortint-benchmarks ]
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
label: ${{ needs.setup-instance.outputs.runner-name }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "Instance teardown (shortint-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
130
.github/workflows/signed_integer_benchmark.yml
vendored
130
.github/workflows/signed_integer_benchmark.yml
vendored
@@ -1,130 +0,0 @@
|
||||
# Run signed integer benchmarks on an AWS instance and return parsed results to Slab CI bot.
|
||||
name: Signed Integer benchmarks
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
instance_id:
|
||||
description: "Instance ID"
|
||||
type: string
|
||||
instance_image_id:
|
||||
description: "Instance AMI ID"
|
||||
type: string
|
||||
instance_type:
|
||||
description: "Instance product type"
|
||||
type: string
|
||||
runner_name:
|
||||
description: "Action runner name"
|
||||
type: string
|
||||
request_id:
|
||||
description: "Slab request ID"
|
||||
type: string
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
|
||||
PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUST_BACKTRACE: "full"
|
||||
RUST_MIN_STACK: "8388608"
|
||||
|
||||
jobs:
|
||||
run-integer-benchmarks:
|
||||
name: Execute signed integer benchmarks in EC2
|
||||
runs-on: ${{ github.event.inputs.runner_name }}
|
||||
if: ${{ !cancelled() }}
|
||||
steps:
|
||||
- name: Instance configuration used
|
||||
run: |
|
||||
echo "IDs: ${{ inputs.instance_id }}"
|
||||
echo "AMI: ${{ inputs.instance_image_id }}"
|
||||
echo "Type: ${{ inputs.instance_type }}"
|
||||
echo "Request ID: ${{ inputs.request_id }}"
|
||||
|
||||
- name: Get benchmark date
|
||||
run: |
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Set up home
|
||||
# "Install rust" step requires root user to have a HOME directory, which is not set.
|
||||
run: |
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
- name: Run benchmarks with AVX512
|
||||
run: |
|
||||
make FAST_BENCH=TRUE bench_signed_integer
|
||||
|
||||
- name: Parse benchmarks to csv
|
||||
run: |
|
||||
make PARSE_INTEGER_BENCH_CSV_FILE=${{ env.PARSE_INTEGER_BENCH_CSV_FILE }} \
|
||||
parse_integer_benches
|
||||
|
||||
- name: Upload csv results artifact
|
||||
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
|
||||
with:
|
||||
name: ${{ github.sha }}_csv_integer
|
||||
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
|
||||
|
||||
- name: Parse results
|
||||
run: |
|
||||
COMMIT_DATE="$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})"
|
||||
COMMIT_HASH="$(git describe --tags --dirty)"
|
||||
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
|
||||
--database tfhe_rs \
|
||||
--hardware ${{ inputs.instance_type }} \
|
||||
--project-version "${COMMIT_HASH}" \
|
||||
--branch ${{ github.ref_name }} \
|
||||
--commit-date "${COMMIT_DATE}" \
|
||||
--bench-date "${{ env.BENCH_DATE }}" \
|
||||
--walk-subdirs \
|
||||
--name-suffix avx512 \
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
|
||||
with:
|
||||
name: ${{ github.sha }}_integer
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Send data to Slab
|
||||
shell: bash
|
||||
run: |
|
||||
echo "Computing HMac on results file"
|
||||
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
|
||||
echo "Sending results to Slab..."
|
||||
curl -v -k \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "X-Slab-Repository: ${{ github.repository }}" \
|
||||
-H "X-Slab-Command: store_data_v2" \
|
||||
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
|
||||
-d @${{ env.RESULTS_FILENAME }} \
|
||||
${{ secrets.SLAB_URL }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_MESSAGE: "Signed integer benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
191
.github/workflows/signed_integer_cpu_benchmark.yml
vendored
Normal file
191
.github/workflows/signed_integer_cpu_benchmark.yml
vendored
Normal file
@@ -0,0 +1,191 @@
|
||||
# Run all signed integer benchmarks on an AWS instance and return parsed results to Slab CI bot.
|
||||
name: Signed Integer full benchmarks
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
all_precisions:
|
||||
description: "Run all precisions"
|
||||
type: boolean
|
||||
default: false
|
||||
|
||||
schedule:
|
||||
# Weekly benchmarks will be triggered each Saturday at 1a.m.
|
||||
- cron: '0 1 * * 6'
|
||||
# Quarterly benchmarks will be triggered right before end of quarter, the 25th of the current month at 4a.m.
|
||||
# These benchmarks are far longer to execute, hence the reason to run them only four times a year.
|
||||
- cron: '0 4 25 MAR,JUN,SEP,DEC *'
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUST_BACKTRACE: "full"
|
||||
RUST_MIN_STACK: "8388608"
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
FAST_BENCH: TRUE
|
||||
|
||||
jobs:
|
||||
prepare-matrix:
|
||||
name: Prepare operations matrix
|
||||
runs-on: ubuntu-latest
|
||||
if: github.event_name != 'schedule' ||
|
||||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
|
||||
outputs:
|
||||
op_flavor: ${{ steps.set_op_flavor.outputs.op_flavor }}
|
||||
steps:
|
||||
- name: Weekly benchmarks
|
||||
if: github.event_name == 'workflow_dispatch' ||
|
||||
github.event.schedule == '0 1 * * 6'
|
||||
run: |
|
||||
echo "OP_FLAVOR=[\"default\"]" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Quarterly benchmarks
|
||||
if: github.event.schedule == '0 4 25 MAR,JUN,SEP,DEC *'
|
||||
run: |
|
||||
echo "OP_FLAVOR=[\"default\", \"unchecked\"]" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Set operation flavor output
|
||||
id: set_op_flavor
|
||||
run: |
|
||||
echo "op_flavor=${{ toJSON(env.OP_FLAVOR) }}" >> "${GITHUB_OUTPUT}"
|
||||
|
||||
setup-instance:
|
||||
name: Setup instance (signed-integer-benchmarks)
|
||||
needs: prepare-matrix
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
runner-name: ${{ steps.start-instance.outputs.label }}
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
backend: aws
|
||||
profile: bench
|
||||
|
||||
signed-integer-benchmarks:
|
||||
name: Execute signed integer benchmarks for all operations flavor
|
||||
needs: [ prepare-matrix, setup-instance ]
|
||||
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}_${{ github.ref }}
|
||||
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
|
||||
continue-on-error: true
|
||||
timeout-minutes: 1440 # 24 hours
|
||||
strategy:
|
||||
max-parallel: 1
|
||||
matrix:
|
||||
command: [ integer, integer_multi_bit ]
|
||||
op_flavor: [ default, unchecked ]
|
||||
steps:
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Get benchmark details
|
||||
run: |
|
||||
{
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)";
|
||||
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
|
||||
echo "COMMIT_HASH=$(git describe --tags --dirty)";
|
||||
} >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Set up home
|
||||
# "Install rust" step requires root user to have a HOME directory, which is not set.
|
||||
run: |
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Should run benchmarks with all precisions
|
||||
if: inputs.all_precisions
|
||||
run: |
|
||||
echo "FAST_BENCH=FALSE" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Run benchmarks with AVX512
|
||||
run: |
|
||||
make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_signed_${{ matrix.command }}
|
||||
|
||||
- name: Parse results
|
||||
run: |
|
||||
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
|
||||
--database tfhe_rs \
|
||||
--hardware "hpc7a.96xlarge" \
|
||||
--project-version "${{ env.COMMIT_HASH }}" \
|
||||
--branch ${{ github.ref_name }} \
|
||||
--commit-date "${{ env.COMMIT_DATE }}" \
|
||||
--bench-date "${{ env.BENCH_DATE }}" \
|
||||
--walk-subdirs \
|
||||
--name-suffix avx512 \
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
|
||||
with:
|
||||
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
- name: Send data to Slab
|
||||
shell: bash
|
||||
run: |
|
||||
echo "Computing HMac on results file"
|
||||
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
|
||||
echo "Sending results to Slab..."
|
||||
curl -v -k \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "X-Slab-Repository: ${{ github.repository }}" \
|
||||
-H "X-Slab-Command: store_data_v2" \
|
||||
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
|
||||
-d @${{ env.RESULTS_FILENAME }} \
|
||||
${{ secrets.SLAB_URL }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "Signed integer full benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
|
||||
teardown-instance:
|
||||
name: Teardown instance (integer-benchmarks)
|
||||
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
|
||||
needs: [ setup-instance, signed-integer-benchmarks ]
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
label: ${{ needs.setup-instance.outputs.runner-name }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "Instance teardown (signed-integer-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
136
.github/workflows/signed_integer_full_benchmark.yml
vendored
136
.github/workflows/signed_integer_full_benchmark.yml
vendored
@@ -1,136 +0,0 @@
|
||||
# Run all signed integer benchmarks on an AWS instance and return parsed results to Slab CI bot.
|
||||
name: Signed Integer full benchmarks
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
instance_id:
|
||||
description: "Instance ID"
|
||||
type: string
|
||||
instance_image_id:
|
||||
description: "Instance AMI ID"
|
||||
type: string
|
||||
instance_type:
|
||||
description: "Instance product type"
|
||||
type: string
|
||||
runner_name:
|
||||
description: "Action runner name"
|
||||
type: string
|
||||
request_id:
|
||||
description: "Slab request ID"
|
||||
type: string
|
||||
user_inputs:
|
||||
description: "Type of benchmarks to run"
|
||||
type: string
|
||||
default: "weekly_benchmarks"
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUST_BACKTRACE: "full"
|
||||
RUST_MIN_STACK: "8388608"
|
||||
|
||||
jobs:
|
||||
integer-benchmarks:
|
||||
name: Execute signed integer benchmarks for all operations flavor
|
||||
runs-on: ${{ github.event.inputs.runner_name }}
|
||||
if: ${{ !cancelled() }}
|
||||
continue-on-error: true
|
||||
timeout-minutes: 1440 # 24 hours
|
||||
strategy:
|
||||
max-parallel: 1
|
||||
matrix:
|
||||
command: [ integer, integer_multi_bit ]
|
||||
op_flavor: [ default, unchecked ]
|
||||
steps:
|
||||
- name: Instance configuration used
|
||||
run: |
|
||||
echo "IDs: ${{ inputs.instance_id }}"
|
||||
echo "AMI: ${{ inputs.instance_image_id }}"
|
||||
echo "Type: ${{ inputs.instance_type }}"
|
||||
echo "Request ID: ${{ inputs.request_id }}"
|
||||
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Get benchmark details
|
||||
run: |
|
||||
{
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)";
|
||||
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
|
||||
echo "COMMIT_HASH=$(git describe --tags --dirty)";
|
||||
} >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Set up home
|
||||
# "Install rust" step requires root user to have a HOME directory, which is not set.
|
||||
run: |
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Run benchmarks with AVX512
|
||||
run: |
|
||||
make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_signed_${{ matrix.command }}
|
||||
|
||||
- name: Parse results
|
||||
run: |
|
||||
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
|
||||
--database tfhe_rs \
|
||||
--hardware ${{ inputs.instance_type }} \
|
||||
--project-version "${{ env.COMMIT_HASH }}" \
|
||||
--branch ${{ github.ref_name }} \
|
||||
--commit-date "${{ env.COMMIT_DATE }}" \
|
||||
--bench-date "${{ env.BENCH_DATE }}" \
|
||||
--walk-subdirs \
|
||||
--name-suffix avx512 \
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
|
||||
with:
|
||||
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
- name: Send data to Slab
|
||||
shell: bash
|
||||
run: |
|
||||
echo "Computing HMac on results file"
|
||||
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
|
||||
echo "Sending results to Slab..."
|
||||
curl -v -k \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "X-Slab-Repository: ${{ github.repository }}" \
|
||||
-H "X-Slab-Command: store_data_v2" \
|
||||
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
|
||||
-d @${{ env.RESULTS_FILENAME }} \
|
||||
${{ secrets.SLAB_URL }}
|
||||
|
||||
slack-notification:
|
||||
name: Slack Notification
|
||||
runs-on: ${{ github.event.inputs.runner_name }}
|
||||
if: ${{ failure() }}
|
||||
needs: integer-benchmarks
|
||||
steps:
|
||||
- name: Notify
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_MESSAGE: "Signed integer full benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
@@ -1,130 +0,0 @@
|
||||
# Run signed integer benchmarks with multi-bit cryptographic parameters on an AWS instance and return parsed results to Slab CI bot.
|
||||
name: Signed Integer Multi-bit benchmarks
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
instance_id:
|
||||
description: "Instance ID"
|
||||
type: string
|
||||
instance_image_id:
|
||||
description: "Instance AMI ID"
|
||||
type: string
|
||||
instance_type:
|
||||
description: "Instance product type"
|
||||
type: string
|
||||
runner_name:
|
||||
description: "Action runner name"
|
||||
type: string
|
||||
request_id:
|
||||
description: "Slab request ID"
|
||||
type: string
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
|
||||
PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUST_BACKTRACE: "full"
|
||||
RUST_MIN_STACK: "8388608"
|
||||
|
||||
jobs:
|
||||
run-integer-benchmarks:
|
||||
name: Execute signed integer multi-bit benchmarks in EC2
|
||||
runs-on: ${{ github.event.inputs.runner_name }}
|
||||
if: ${{ !cancelled() }}
|
||||
steps:
|
||||
- name: Instance configuration used
|
||||
run: |
|
||||
echo "IDs: ${{ inputs.instance_id }}"
|
||||
echo "AMI: ${{ inputs.instance_image_id }}"
|
||||
echo "Type: ${{ inputs.instance_type }}"
|
||||
echo "Request ID: ${{ inputs.request_id }}"
|
||||
|
||||
- name: Get benchmark date
|
||||
run: |
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Set up home
|
||||
# "Install rust" step requires root user to have a HOME directory, which is not set.
|
||||
run: |
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
- name: Run multi-bit benchmarks with AVX512
|
||||
run: |
|
||||
make FAST_BENCH=TRUE bench_signed_integer_multi_bit
|
||||
|
||||
- name: Parse benchmarks to csv
|
||||
run: |
|
||||
make PARSE_INTEGER_BENCH_CSV_FILE=${{ env.PARSE_INTEGER_BENCH_CSV_FILE }} \
|
||||
parse_integer_benches
|
||||
|
||||
- name: Upload csv results artifact
|
||||
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
|
||||
with:
|
||||
name: ${{ github.sha }}_csv_integer
|
||||
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
|
||||
|
||||
- name: Parse results
|
||||
run: |
|
||||
COMMIT_DATE="$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})"
|
||||
COMMIT_HASH="$(git describe --tags --dirty)"
|
||||
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
|
||||
--database tfhe_rs \
|
||||
--hardware ${{ inputs.instance_type }} \
|
||||
--project-version "${COMMIT_HASH}" \
|
||||
--branch ${{ github.ref_name }} \
|
||||
--commit-date "${COMMIT_DATE}" \
|
||||
--bench-date "${{ env.BENCH_DATE }}" \
|
||||
--walk-subdirs \
|
||||
--name-suffix avx512 \
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
|
||||
with:
|
||||
name: ${{ github.sha }}_integer
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Send data to Slab
|
||||
shell: bash
|
||||
run: |
|
||||
echo "Computing HMac on results file"
|
||||
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
|
||||
echo "Sending results to Slab..."
|
||||
curl -v -k \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "X-Slab-Repository: ${{ github.repository }}" \
|
||||
-H "X-Slab-Command: store_data_v2" \
|
||||
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
|
||||
-d @${{ env.RESULTS_FILENAME }} \
|
||||
${{ secrets.SLAB_URL }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_MESSAGE: "Signed integer benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
123
.github/workflows/start_benchmarks.yml
vendored
123
.github/workflows/start_benchmarks.yml
vendored
@@ -1,123 +0,0 @@
|
||||
# Start all benchmark jobs on Slab CI bot.
|
||||
name: Start all benchmarks
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- "main"
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
# The input name must be the name of the slab command to launch
|
||||
boolean_bench:
|
||||
description: "Run Boolean benches"
|
||||
type: boolean
|
||||
default: true
|
||||
shortint_bench:
|
||||
description: "Run shortint benches"
|
||||
type: boolean
|
||||
default: true
|
||||
integer_bench:
|
||||
description: "Run integer benches"
|
||||
type: boolean
|
||||
default: true
|
||||
signed_integer_bench:
|
||||
description: "Run signed integer benches"
|
||||
type: boolean
|
||||
default: true
|
||||
integer_multi_bit_bench:
|
||||
description: "Run integer multi bit benches"
|
||||
type: boolean
|
||||
default: true
|
||||
signed_integer_multi_bit_bench:
|
||||
description: "Run signed integer multi bit benches"
|
||||
type: boolean
|
||||
default: true
|
||||
core_crypto_bench:
|
||||
description: "Run core crypto benches"
|
||||
type: boolean
|
||||
default: true
|
||||
|
||||
jobs:
|
||||
start-benchmarks:
|
||||
if: ${{ (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs') || github.event_name == 'workflow_dispatch' }}
|
||||
strategy:
|
||||
matrix:
|
||||
command: [ boolean_bench, shortint_bench,
|
||||
integer_bench, integer_multi_bit_bench,
|
||||
signed_integer_bench, signed_integer_multi_bit_bench,
|
||||
core_crypto_bench ]
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Check for file changes
|
||||
id: changed-files
|
||||
uses: tj-actions/changed-files@eaf854ef0c266753e1abec356dcf17d92695b251
|
||||
with:
|
||||
files_yaml: |
|
||||
common_benches:
|
||||
- toolchain.txt
|
||||
- Makefile
|
||||
- ci/slab.toml
|
||||
- tfhe/Cargo.toml
|
||||
- tfhe/src/core_crypto/**
|
||||
- .github/workflows/start_benchmarks.yml
|
||||
boolean_bench:
|
||||
- tfhe/src/boolean/**
|
||||
- tfhe/benches/boolean/**
|
||||
- .github/workflows/boolean_benchmark.yml
|
||||
shortint_bench:
|
||||
- tfhe/src/shortint/**
|
||||
- tfhe/benches/shortint/**
|
||||
- .github/workflows/shortint_benchmark.yml
|
||||
integer_bench:
|
||||
- tfhe/src/shortint/**
|
||||
- tfhe/src/integer/**
|
||||
- tfhe/benches/integer/bench.rs
|
||||
- .github/workflows/integer_benchmark.yml
|
||||
integer_multi_bit_bench:
|
||||
- tfhe/src/shortint/**
|
||||
- tfhe/src/integer/**
|
||||
- tfhe/benches/integer/bench.rs
|
||||
- .github/workflows/integer_multi_bit_benchmark.yml
|
||||
signed_integer_bench:
|
||||
- tfhe/src/shortint/**
|
||||
- tfhe/src/integer/**
|
||||
- tfhe/benches/integer/signed_bench.rs
|
||||
- .github/workflows/signed_integer_benchmark.yml
|
||||
signed_integer_multi_bit_bench:
|
||||
- tfhe/src/shortint/**
|
||||
- tfhe/src/integer/**
|
||||
- tfhe/benches/integer/signed_bench.rs
|
||||
- .github/workflows/signed_integer_multi_bit_benchmark.yml
|
||||
core_crypto_bench:
|
||||
- tfhe/src/core_crypto/**
|
||||
- tfhe/benches/core_crypto/**
|
||||
- .github/workflows/core_crypto_benchmark.yml
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Start AWS job in Slab
|
||||
# If manually triggered check that the current bench has been requested
|
||||
# Otherwise if it's on push check that files relevant to benchmarks have changed
|
||||
if: (github.event_name == 'workflow_dispatch' && github.event.inputs[matrix.command] == 'true') || (github.event_name == 'push' && (steps.changed-files.outputs.common_benches_any_changed == 'true' || steps.changed-files.outputs[format('{0}_any_changed', matrix.command)] == 'true'))
|
||||
shell: bash
|
||||
run: |
|
||||
echo -n '{"command": "${{ matrix.command }}", "git_ref": "${{ github.ref }}", "sha": "${{ github.sha }}"}' > command.json
|
||||
SIGNATURE="$(slab/scripts/hmac_calculator.sh command.json '${{ secrets.JOB_SECRET }}')"
|
||||
curl -v -k \
|
||||
--fail-with-body \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "X-Slab-Repository: ${{ github.repository }}" \
|
||||
-H "X-Slab-Command: start_aws" \
|
||||
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
|
||||
-d @command.json \
|
||||
${{ secrets.SLAB_URL }}
|
||||
66
.github/workflows/start_full_benchmarks.yml
vendored
66
.github/workflows/start_full_benchmarks.yml
vendored
@@ -1,66 +0,0 @@
|
||||
# Start all benchmark jobs, including full shortint and integer, on Slab CI bot.
|
||||
name: Start full suite benchmarks
|
||||
|
||||
on:
|
||||
schedule:
|
||||
# Weekly benchmarks will be triggered each Saturday at 1a.m.
|
||||
- cron: '0 1 * * 6'
|
||||
# Quarterly benchmarks will be triggered right before end of quarter, the 25th of the current month at 4a.m.
|
||||
# These benchmarks are far longer to execute hence the reason to run them only four times a year.
|
||||
- cron: '0 4 25 MAR,JUN,SEP,DEC *'
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
benchmark_type:
|
||||
description: 'Benchmark type'
|
||||
required: true
|
||||
default: 'weekly'
|
||||
type: choice
|
||||
options:
|
||||
- weekly
|
||||
- quarterly
|
||||
|
||||
jobs:
|
||||
start-benchmarks:
|
||||
if: ${{ (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') || github.event_name == 'workflow_dispatch' }}
|
||||
strategy:
|
||||
matrix:
|
||||
command: [ boolean_bench, shortint_full_bench,
|
||||
integer_full_bench, signed_integer_full_bench,
|
||||
core_crypto_bench, wasm_client_bench ]
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Set benchmarks type as weekly
|
||||
if: (github.event_name == 'workflow_dispatch' && inputs.benchmark_type == 'weekly') || github.event.schedule == '0 1 * * 6'
|
||||
run: |
|
||||
echo "BENCH_TYPE=weekly_benchmarks" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Set benchmarks type as quarterly
|
||||
if: (github.event_name == 'workflow_dispatch' && inputs.benchmark_type == 'quarterly') || github.event.schedule == '0 4 25 MAR,JUN,SEP,DEC *'
|
||||
run: |
|
||||
echo "BENCH_TYPE=quarterly_benchmarks" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Start AWS job in Slab
|
||||
shell: bash
|
||||
run: |
|
||||
echo -n '{"command": "${{ matrix.command }}", "git_ref": "${{ github.ref }}", "sha": "${{ github.sha }}", "user_inputs": "${{ env.BENCH_TYPE }}"}' > command.json
|
||||
SIGNATURE="$(slab/scripts/hmac_calculator.sh command.json '${{ secrets.JOB_SECRET }}')"
|
||||
curl -v -k \
|
||||
--fail-with-body \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "X-Slab-Repository: ${{ github.repository }}" \
|
||||
-H "X-Slab-Command: start_aws" \
|
||||
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
|
||||
-d @command.json \
|
||||
${{ secrets.SLAB_URL }}
|
||||
21
.github/workflows/wasm_client_benchmark.yml
vendored
21
.github/workflows/wasm_client_benchmark.yml
vendored
@@ -25,7 +25,8 @@ jobs:
|
||||
should-run:
|
||||
runs-on: ubuntu-latest
|
||||
if: github.event_name == 'workflow_dispatch' ||
|
||||
((github.event_name == 'push' || github.event_name == 'schedule') && github.repository == 'zama-ai/tfhe-rs')
|
||||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
|
||||
(github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs')
|
||||
permissions:
|
||||
pull-requests: write
|
||||
outputs:
|
||||
@@ -38,7 +39,7 @@ jobs:
|
||||
|
||||
- name: Check for file changes
|
||||
id: changed-files
|
||||
uses: tj-actions/changed-files@eaf854ef0c266753e1abec356dcf17d92695b251
|
||||
uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
|
||||
with:
|
||||
since_last_remote_commit: true
|
||||
files_yaml: |
|
||||
@@ -53,7 +54,8 @@ jobs:
|
||||
|
||||
setup-instance:
|
||||
name: Setup instance (wasm-client-benchmarks)
|
||||
if: github.event_name != 'push' ||
|
||||
if: github.event_name == 'workflow_dispatch' ||
|
||||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
|
||||
(github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs' && needs.should-run.outputs.wasm_bench)
|
||||
needs: should-run
|
||||
runs-on: ubuntu-latest
|
||||
@@ -62,7 +64,7 @@ jobs:
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -73,9 +75,8 @@ jobs:
|
||||
|
||||
wasm-client-benchmarks:
|
||||
name: Execute WASM client benchmarks
|
||||
needs: [ should-run, setup-instance ]
|
||||
if: github.event_name != 'push' ||
|
||||
(github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs' && needs.should-run.outputs.wasm_bench)
|
||||
needs: setup-instance
|
||||
if: needs.setup-instance.result != 'skipped'
|
||||
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
@@ -97,7 +98,7 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
@@ -129,7 +130,7 @@ jobs:
|
||||
--append-results
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
|
||||
uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
|
||||
with:
|
||||
name: ${{ github.sha }}_wasm
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
@@ -171,7 +172,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
24
.github/workflows/zk_pke_benchmark.yml
vendored
24
.github/workflows/zk_pke_benchmark.yml
vendored
@@ -24,8 +24,8 @@ env:
|
||||
jobs:
|
||||
should-run:
|
||||
runs-on: ubuntu-latest
|
||||
if: github.event_name != 'push' ||
|
||||
(github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs')
|
||||
if: github.event_name == 'workflow_dispatch' ||
|
||||
((github.event_name == 'push' || github.event_name == 'schedule') && github.repository == 'zama-ai/tfhe-rs')
|
||||
outputs:
|
||||
zk_pok_changed: ${{ steps.changed-files.outputs.zk_pok_any_changed }}
|
||||
steps:
|
||||
@@ -36,7 +36,7 @@ jobs:
|
||||
|
||||
- name: Check for file changes
|
||||
id: changed-files
|
||||
uses: tj-actions/changed-files@eaf854ef0c266753e1abec356dcf17d92695b251
|
||||
uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
|
||||
with:
|
||||
since_last_remote_commit: true
|
||||
files_yaml: |
|
||||
@@ -55,7 +55,7 @@ jobs:
|
||||
name: Setup instance (pke-zk-benchmarks)
|
||||
runs-on: ubuntu-latest
|
||||
needs: should-run
|
||||
if: github.event_name != 'push' ||
|
||||
if: github.event_name == 'workflow_dispatch' ||
|
||||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
|
||||
(github.event_name == 'push' &&
|
||||
github.repository == 'zama-ai/tfhe-rs' &&
|
||||
@@ -65,7 +65,7 @@ jobs:
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -76,12 +76,10 @@ jobs:
|
||||
|
||||
pke-zk-benchmarks:
|
||||
name: Execute PKE ZK benchmarks
|
||||
if: github.event_name != 'push' ||
|
||||
((github.event_name == 'push' || github.event_name == 'schedule') &&
|
||||
needs.setup-instance.result != 'skipped')
|
||||
needs: [ should-run, setup-instance ]
|
||||
if: needs.setup-instance.result != 'skipped'
|
||||
needs: setup-instance
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}_${{github.event_name}}_${{ github.ref }}
|
||||
group: ${{ github.workflow }}_${{github.event_name}}_${{ github.ref }}${{ github.ref == 'refs/heads/main' && github.sha || '' }}
|
||||
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
|
||||
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
|
||||
steps:
|
||||
@@ -104,7 +102,7 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
@@ -140,7 +138,7 @@ jobs:
|
||||
--append-results
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
|
||||
uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
|
||||
with:
|
||||
name: ${{ github.sha }}_integer_zk
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
@@ -182,7 +180,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
1
.gitignore
vendored
1
.gitignore
vendored
@@ -7,6 +7,7 @@ target/
|
||||
# In case of symlinked keys
|
||||
/keys
|
||||
|
||||
**/*.rmeta
|
||||
**/Cargo.lock
|
||||
**/*.bin
|
||||
|
||||
|
||||
@@ -8,10 +8,13 @@ members = [
|
||||
"concrete-csprng",
|
||||
"backends/tfhe-cuda-backend",
|
||||
"utils/tfhe-versionable",
|
||||
"utils/tfhe-versionable-derive"
|
||||
"utils/tfhe-versionable-derive",
|
||||
]
|
||||
|
||||
exclude = [
|
||||
"tfhe/backward_compatibility_tests"
|
||||
"tfhe/backward_compatibility_tests",
|
||||
"utils/cargo-tfhe-lints-inner",
|
||||
"utils/cargo-tfhe-lints"
|
||||
]
|
||||
|
||||
[profile.bench]
|
||||
|
||||
97
Makefile
97
Makefile
@@ -16,21 +16,15 @@ GEN_KEY_CACHE_COVERAGE_ONLY?=FALSE
|
||||
PARSE_INTEGER_BENCH_CSV_FILE?=tfhe_rs_integer_benches.csv
|
||||
FAST_TESTS?=FALSE
|
||||
FAST_BENCH?=FALSE
|
||||
NIGHTLY_TESTS?=FALSE
|
||||
BENCH_OP_FLAVOR?=DEFAULT
|
||||
NODE_VERSION=20
|
||||
NODE_VERSION=22.6
|
||||
FORWARD_COMPAT?=OFF
|
||||
BACKWARD_COMPAT_DATA_URL=https://github.com/zama-ai/tfhe-backward-compat-data.git
|
||||
BACKWARD_COMPAT_DATA_DIR=tfhe-backward-compat-data
|
||||
# sed: -n, do not print input stream, -e means a script/expression
|
||||
# 1,/version/ indicates from the first line, to the line matching version at the start of the line
|
||||
# p indicates to print, so we keep only the start of the Cargo.toml until we hit the first version
|
||||
# entry which should be the version of tfhe
|
||||
TFHE_CURRENT_VERSION:=\
|
||||
$(shell sed -n -e '1,/^version/p' tfhe/Cargo.toml | \
|
||||
grep '^version[[:space:]]*=' | cut -d '=' -f 2 | xargs)
|
||||
# Cargo has a hard time distinguishing between our package from the workspace and a package that
|
||||
# could be a dependency, so we build an unambiguous spec here
|
||||
TFHE_SPEC:=tfhe@$(TFHE_CURRENT_VERSION)
|
||||
BACKWARD_COMPAT_DATA_BRANCH?=v0.1
|
||||
BACKWARD_COMPAT_DATA_PROJECT=tfhe-backward-compat-data
|
||||
BACKWARD_COMPAT_DATA_DIR=$(BACKWARD_COMPAT_DATA_PROJECT)
|
||||
TFHE_SPEC:=tfhe
|
||||
# This is done to avoid forgetting it, we still specify the RUSTFLAGS in the commands to be able to
|
||||
# copy paste the command in the terminal and change them if required without forgetting the flags
|
||||
export RUSTFLAGS?=-C target-cpu=native
|
||||
@@ -117,7 +111,7 @@ install_cargo_nextest: install_rs_build_toolchain
|
||||
.PHONY: install_wasm_pack # Install wasm-pack to build JS packages
|
||||
install_wasm_pack: install_rs_build_toolchain
|
||||
@wasm-pack --version > /dev/null 2>&1 || \
|
||||
cargo $(CARGO_RS_BUILD_TOOLCHAIN) install wasm-pack || \
|
||||
cargo $(CARGO_RS_BUILD_TOOLCHAIN) install --locked wasm-pack@0.13.0 || \
|
||||
( echo "Unable to install cargo wasm-pack, unknown error." && exit 1 )
|
||||
|
||||
.PHONY: install_node # Install last version of NodeJS via nvm
|
||||
@@ -147,6 +141,11 @@ install_tarpaulin: install_rs_build_toolchain
|
||||
cargo $(CARGO_RS_BUILD_TOOLCHAIN) install cargo-tarpaulin --locked || \
|
||||
( echo "Unable to install cargo tarpaulin, unknown error." && exit 1 )
|
||||
|
||||
.PHONY: install_tfhe_lints # Install custom tfhe-rs lints
|
||||
install_tfhe_lints:
|
||||
(cd utils/cargo-tfhe-lints-inner && cargo install --path .) && \
|
||||
cd utils/cargo-tfhe-lints && cargo install --path .
|
||||
|
||||
.PHONY: check_linelint_installed # Check if linelint newline linter is installed
|
||||
check_linelint_installed:
|
||||
@printf "\n" | linelint - > /dev/null 2>&1 || \
|
||||
@@ -266,6 +265,17 @@ clippy: install_rs_check_toolchain
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer \
|
||||
-p $(TFHE_SPEC) -- --no-deps -D warnings
|
||||
|
||||
.PHONY: clippy_rustdoc # Run clippy lints on doctests enabling the boolean, shortint, integer and zk-pok
|
||||
clippy_rustdoc: install_rs_check_toolchain
|
||||
if [[ "$(OS)" != "Linux" && "$(OS)" != "Darwin" ]]; then \
|
||||
echo "WARNING: skipped clippy_rustdoc, unsupported OS $(OS)"; \
|
||||
exit 0; \
|
||||
fi && \
|
||||
CLIPPYFLAGS="-D warnings" RUSTDOCFLAGS="--no-run --nocapture --test-builder ./scripts/clippy_driver.sh -Z unstable-options" \
|
||||
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" test --doc \
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,zk-pok,pbs-stats \
|
||||
-p $(TFHE_SPEC)
|
||||
|
||||
.PHONY: clippy_c_api # Run clippy lints enabling the boolean, shortint and the C API
|
||||
clippy_c_api: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
|
||||
@@ -306,18 +316,23 @@ clippy_zk_pok: install_rs_check_toolchain
|
||||
-p tfhe-zk-pok -- --no-deps -D warnings
|
||||
|
||||
.PHONY: clippy_all # Run all clippy targets
|
||||
clippy_all: clippy clippy_boolean clippy_shortint clippy_integer clippy_all_targets clippy_c_api \
|
||||
clippy_js_wasm_api clippy_tasks clippy_core clippy_concrete_csprng clippy_zk_pok clippy_trivium
|
||||
clippy_all: clippy_rustdoc clippy clippy_boolean clippy_shortint clippy_integer clippy_all_targets \
|
||||
clippy_c_api clippy_js_wasm_api clippy_tasks clippy_core clippy_concrete_csprng clippy_zk_pok clippy_trivium
|
||||
|
||||
.PHONY: clippy_fast # Run main clippy targets
|
||||
clippy_fast: clippy clippy_all_targets clippy_c_api clippy_js_wasm_api clippy_tasks clippy_core \
|
||||
clippy_concrete_csprng
|
||||
clippy_fast: clippy_rustdoc clippy clippy_all_targets clippy_c_api clippy_js_wasm_api clippy_tasks \
|
||||
clippy_core clippy_concrete_csprng
|
||||
|
||||
.PHONY: clippy_cuda_backend # Run clippy lints on the tfhe-cuda-backend
|
||||
clippy_cuda_backend: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
|
||||
-p tfhe-cuda-backend -- --no-deps -D warnings
|
||||
|
||||
.PHONY: tfhe_lints # Run custom tfhe-rs lints
|
||||
tfhe_lints: install_tfhe_lints
|
||||
cd tfhe && RUSTFLAGS="$(RUSTFLAGS)" cargo tfhe-lints \
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer -- -D warnings
|
||||
|
||||
.PHONY: build_core # Build core_crypto without experimental features
|
||||
build_core: install_rs_build_toolchain install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
|
||||
@@ -402,7 +417,8 @@ build_web_js_api_parallel: install_rs_check_toolchain install_wasm_pack
|
||||
RUSTFLAGS="$(WASM_RUSTFLAGS) -C target-feature=+atomics,+bulk-memory,+mutable-globals" rustup run $(RS_CHECK_TOOLCHAIN) \
|
||||
wasm-pack build --release --target=web \
|
||||
-- --features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,parallel-wasm-api,zk-pok \
|
||||
-Z build-std=panic_abort,std
|
||||
-Z build-std=panic_abort,std && \
|
||||
find pkg/snippets -type f -iname workerHelpers.worker.js -exec sed -i "s|from '..\/..\/..\/';|from '..\/..\/..\/tfhe.js';|" {} \;
|
||||
|
||||
.PHONY: build_node_js_api # Build the js API targeting nodejs
|
||||
build_node_js_api: install_rs_build_toolchain install_wasm_pack
|
||||
@@ -445,8 +461,8 @@ test_cuda_backend:
|
||||
mkdir -p "$(TFHECUDA_BUILD)" && \
|
||||
cd "$(TFHECUDA_BUILD)" && \
|
||||
cmake .. -DCMAKE_BUILD_TYPE=Release -DTFHE_CUDA_BACKEND_BUILD_TESTS=ON && \
|
||||
make -j "$(CPU_COUNT)" && \
|
||||
make test
|
||||
"$(MAKE)" -j "$(CPU_COUNT)" && \
|
||||
"$(MAKE)" test
|
||||
|
||||
.PHONY: test_gpu # Run the tests of the core_crypto module including experimental on the gpu backend
|
||||
test_gpu: test_core_crypto_gpu test_integer_gpu test_cuda_backend
|
||||
@@ -469,6 +485,7 @@ test_integer_gpu: install_rs_build_toolchain
|
||||
test_integer_gpu_ci: install_rs_check_toolchain install_cargo_nextest
|
||||
BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
|
||||
FAST_TESTS="$(FAST_TESTS)" \
|
||||
NIGHTLY_TESTS="$(NIGHTLY_TESTS)" \
|
||||
./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
|
||||
--cargo-profile "$(CARGO_PROFILE)" --backend "gpu" \
|
||||
--tfhe-package "$(TFHE_SPEC)"
|
||||
@@ -477,6 +494,7 @@ test_integer_gpu_ci: install_rs_check_toolchain install_cargo_nextest
|
||||
test_unsigned_integer_gpu_ci: install_rs_check_toolchain install_cargo_nextest
|
||||
BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
|
||||
FAST_TESTS="$(FAST_TESTS)" \
|
||||
NIGHTLY_TESTS="$(NIGHTLY_TESTS)" \
|
||||
./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
|
||||
--cargo-profile "$(CARGO_PROFILE)" --backend "gpu" \
|
||||
--unsigned-only --tfhe-package "$(TFHE_SPEC)"
|
||||
@@ -485,6 +503,7 @@ test_unsigned_integer_gpu_ci: install_rs_check_toolchain install_cargo_nextest
|
||||
test_signed_integer_gpu_ci: install_rs_check_toolchain install_cargo_nextest
|
||||
BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
|
||||
FAST_TESTS="$(FAST_TESTS)" \
|
||||
NIGHTLY_TESTS="$(NIGHTLY_TESTS)" \
|
||||
./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
|
||||
--cargo-profile "$(CARGO_PROFILE)" --backend "gpu" \
|
||||
--signed-only --tfhe-package "$(TFHE_SPEC)"
|
||||
@@ -493,6 +512,7 @@ test_signed_integer_gpu_ci: install_rs_check_toolchain install_cargo_nextest
|
||||
test_integer_multi_bit_gpu_ci: install_rs_check_toolchain install_cargo_nextest
|
||||
BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
|
||||
FAST_TESTS="$(FAST_TESTS)" \
|
||||
NIGHTLY_TESTS="$(NIGHTLY_TESTS)" \
|
||||
./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
|
||||
--cargo-profile "$(CARGO_PROFILE)" --multi-bit --backend "gpu" \
|
||||
--tfhe-package "$(TFHE_SPEC)"
|
||||
@@ -501,6 +521,7 @@ test_integer_multi_bit_gpu_ci: install_rs_check_toolchain install_cargo_nextest
|
||||
test_unsigned_integer_multi_bit_gpu_ci: install_rs_check_toolchain install_cargo_nextest
|
||||
BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
|
||||
FAST_TESTS="$(FAST_TESTS)" \
|
||||
NIGHTLY_TESTS="$(NIGHTLY_TESTS)" \
|
||||
./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
|
||||
--cargo-profile "$(CARGO_PROFILE)" --multi-bit --backend "gpu" \
|
||||
--unsigned-only --tfhe-package "$(TFHE_SPEC)"
|
||||
@@ -509,6 +530,7 @@ test_unsigned_integer_multi_bit_gpu_ci: install_rs_check_toolchain install_cargo
|
||||
test_signed_integer_multi_bit_gpu_ci: install_rs_check_toolchain install_cargo_nextest
|
||||
BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
|
||||
FAST_TESTS="$(FAST_TESTS)" \
|
||||
NIGHTLY_TESTS="$(NIGHTLY_TESTS)" \
|
||||
./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
|
||||
--cargo-profile "$(CARGO_PROFILE)" --multi-bit --backend "gpu" \
|
||||
--signed-only --tfhe-package "$(TFHE_SPEC)"
|
||||
@@ -575,6 +597,7 @@ test_shortint_cov: install_rs_check_toolchain install_tarpaulin
|
||||
test_integer_ci: install_rs_check_toolchain install_cargo_nextest
|
||||
BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
|
||||
FAST_TESTS="$(FAST_TESTS)" \
|
||||
NIGHTLY_TESTS="$(NIGHTLY_TESTS)" \
|
||||
./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
|
||||
--cargo-profile "$(CARGO_PROFILE)" --avx512-support "$(AVX512_SUPPORT)" \
|
||||
--tfhe-package "$(TFHE_SPEC)"
|
||||
@@ -583,6 +606,7 @@ test_integer_ci: install_rs_check_toolchain install_cargo_nextest
|
||||
test_unsigned_integer_ci: install_rs_check_toolchain install_cargo_nextest
|
||||
BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
|
||||
FAST_TESTS="$(FAST_TESTS)" \
|
||||
NIGHTLY_TESTS="$(NIGHTLY_TESTS)" \
|
||||
./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
|
||||
--cargo-profile "$(CARGO_PROFILE)" --avx512-support "$(AVX512_SUPPORT)" \
|
||||
--unsigned-only --tfhe-package "$(TFHE_SPEC)"
|
||||
@@ -591,6 +615,7 @@ test_unsigned_integer_ci: install_rs_check_toolchain install_cargo_nextest
|
||||
test_signed_integer_ci: install_rs_check_toolchain install_cargo_nextest
|
||||
BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
|
||||
FAST_TESTS="$(FAST_TESTS)" \
|
||||
NIGHTLY_TESTS="$(NIGHTLY_TESTS)" \
|
||||
./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
|
||||
--cargo-profile "$(CARGO_PROFILE)" --avx512-support "$(AVX512_SUPPORT)" \
|
||||
--signed-only --tfhe-package "$(TFHE_SPEC)"
|
||||
@@ -599,6 +624,7 @@ test_signed_integer_ci: install_rs_check_toolchain install_cargo_nextest
|
||||
test_integer_multi_bit_ci: install_rs_check_toolchain install_cargo_nextest
|
||||
BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
|
||||
FAST_TESTS="$(FAST_TESTS)" \
|
||||
NIGHTLY_TESTS="$(NIGHTLY_TESTS)" \
|
||||
./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
|
||||
--cargo-profile "$(CARGO_PROFILE)" --multi-bit --avx512-support "$(AVX512_SUPPORT)" \
|
||||
--tfhe-package "$(TFHE_SPEC)"
|
||||
@@ -607,6 +633,7 @@ test_integer_multi_bit_ci: install_rs_check_toolchain install_cargo_nextest
|
||||
test_unsigned_integer_multi_bit_ci: install_rs_check_toolchain install_cargo_nextest
|
||||
BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
|
||||
FAST_TESTS="$(FAST_TESTS)" \
|
||||
NIGHTLY_TESTS="$(NIGHTLY_TESTS)" \
|
||||
./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
|
||||
--cargo-profile "$(CARGO_PROFILE)" --multi-bit --avx512-support "$(AVX512_SUPPORT)" \
|
||||
--unsigned-only --tfhe-package "$(TFHE_SPEC)"
|
||||
@@ -615,6 +642,7 @@ test_unsigned_integer_multi_bit_ci: install_rs_check_toolchain install_cargo_nex
|
||||
test_signed_integer_multi_bit_ci: install_rs_check_toolchain install_cargo_nextest
|
||||
BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
|
||||
FAST_TESTS="$(FAST_TESTS)" \
|
||||
NIGHTLY_TESTS="$(NIGHTLY_TESTS)" \
|
||||
./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
|
||||
--cargo-profile "$(CARGO_PROFILE)" --multi-bit --avx512-support "$(AVX512_SUPPORT)" \
|
||||
--signed-only --tfhe-package "$(TFHE_SPEC)"
|
||||
@@ -708,14 +736,21 @@ test_versionable: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
|
||||
-p tfhe-versionable
|
||||
|
||||
# The backward compat data repo holds historical binary data but also rust code to generate and load them.
|
||||
# Here we use the "patch" functionality of Cargo to make sure the repo used for the data is the same as the one used for the code.
|
||||
.PHONY: test_backward_compatibility_ci
|
||||
test_backward_compatibility_ci: install_rs_build_toolchain
|
||||
TFHE_BACKWARD_COMPAT_DATA_DIR="$(BACKWARD_COMPAT_DATA_DIR)" RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
|
||||
--config "patch.'$(BACKWARD_COMPAT_DATA_URL)'.$(BACKWARD_COMPAT_DATA_PROJECT).path=\"tfhe/$(BACKWARD_COMPAT_DATA_DIR)\"" \
|
||||
--features=$(TARGET_ARCH_FEATURE),shortint,integer -p $(TFHE_SPEC) test_backward_compatibility -- --nocapture
|
||||
|
||||
.PHONY: test_backward_compatibility # Same as test_backward_compatibility_ci but tries to clone the data repo first if needed
|
||||
test_backward_compatibility: tfhe/$(BACKWARD_COMPAT_DATA_DIR) test_backward_compatibility_ci
|
||||
|
||||
.PHONY: backward_compat_branch # Prints the required backward compatibility branch
|
||||
backward_compat_branch:
|
||||
@echo "$(BACKWARD_COMPAT_DATA_BRANCH)"
|
||||
|
||||
.PHONY: doc # Build rust doc
|
||||
doc: install_rs_check_toolchain
|
||||
@# Even though we are not in docs.rs, this allows to "just" build the doc
|
||||
@@ -778,7 +813,7 @@ check_compile_tests_benches_gpu: install_rs_build_toolchain
|
||||
mkdir -p "$(TFHECUDA_BUILD)" && \
|
||||
cd "$(TFHECUDA_BUILD)" && \
|
||||
cmake .. -DCMAKE_BUILD_TYPE=Debug -DTFHE_CUDA_BACKEND_BUILD_TESTS=ON -DTFHE_CUDA_BACKEND_BUILD_BENCHMARKS=ON && \
|
||||
make -j "$(CPU_COUNT)"
|
||||
"$(MAKE)" -j "$(CPU_COUNT)"
|
||||
|
||||
.PHONY: build_nodejs_test_docker # Build a docker image with tools to run nodejs tests for wasm API
|
||||
build_nodejs_test_docker:
|
||||
@@ -798,7 +833,7 @@ test_nodejs_wasm_api_in_docker: build_nodejs_test_docker
|
||||
|
||||
.PHONY: test_nodejs_wasm_api # Run tests for the nodejs on wasm API
|
||||
test_nodejs_wasm_api: build_node_js_api
|
||||
cd tfhe && node --test js_on_wasm_tests
|
||||
cd tfhe/js_on_wasm_tests && npm run test
|
||||
|
||||
.PHONY: test_web_js_api_parallel # Run tests for the web wasm api
|
||||
test_web_js_api_parallel: build_web_js_api_parallel
|
||||
@@ -878,7 +913,7 @@ bench_unsigned_integer_multi_bit_gpu: install_rs_check_toolchain
|
||||
__TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench integer-bench \
|
||||
--features=$(TARGET_ARCH_FEATURE),integer,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) -- unsigned
|
||||
--features=$(TARGET_ARCH_FEATURE),integer,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) -- ::unsigned
|
||||
|
||||
.PHONY: bench_integer_zk # Run benchmarks for integer encryption with ZK proofs
|
||||
bench_integer_zk: install_rs_check_toolchain
|
||||
@@ -895,16 +930,12 @@ bench_shortint: install_rs_check_toolchain
|
||||
--bench shortint-bench \
|
||||
--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
|
||||
|
||||
.PHONY: bench_oprf # Run benchmarks for shortint
|
||||
bench_oprf: install_rs_check_toolchain
|
||||
.PHONY: bench_shortint_oprf # Run benchmarks for shortint
|
||||
bench_shortint_oprf: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench oprf-shortint-bench \
|
||||
--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
|
||||
RUSTFLAGS="$(RUSTFLAGS)" \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench oprf-integer-bench \
|
||||
--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
|
||||
|
||||
.PHONY: bench_shortint_multi_bit # Run benchmarks for shortint using multi-bit parameters
|
||||
bench_shortint_multi_bit: install_rs_check_toolchain
|
||||
@@ -934,7 +965,7 @@ bench_pbs128: install_rs_check_toolchain
|
||||
|
||||
.PHONY: bench_pbs_gpu # Run benchmarks for PBS on GPU backend
|
||||
bench_pbs_gpu: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_FAST_BENCH=$(FAST_BENCH) cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench pbs-bench \
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
|
||||
|
||||
@@ -957,7 +988,7 @@ bench_web_js_api_parallel: build_web_js_api_parallel
|
||||
.PHONY: bench_web_js_api_parallel_ci # Run benchmarks for the web wasm api
|
||||
bench_web_js_api_parallel_ci: build_web_js_api_parallel
|
||||
source ~/.nvm/nvm.sh && \
|
||||
nvm use node && \
|
||||
nvm use $(NODE_VERSION) && \
|
||||
$(MAKE) -C tfhe/web_wasm_parallel_tests bench-ci
|
||||
|
||||
#
|
||||
@@ -1016,7 +1047,7 @@ write_params_to_file: install_rs_check_toolchain
|
||||
|
||||
.PHONY: clone_backward_compat_data # Clone the data repo needed for backward compatibility tests
|
||||
clone_backward_compat_data:
|
||||
./scripts/clone_backward_compat_data.sh $(BACKWARD_COMPAT_DATA_URL) tfhe/$(BACKWARD_COMPAT_DATA_DIR)
|
||||
./scripts/clone_backward_compat_data.sh $(BACKWARD_COMPAT_DATA_URL) $(BACKWARD_COMPAT_DATA_BRANCH) tfhe/$(BACKWARD_COMPAT_DATA_DIR)
|
||||
|
||||
tfhe/$(BACKWARD_COMPAT_DATA_DIR): clone_backward_compat_data
|
||||
|
||||
@@ -1046,7 +1077,7 @@ sha256_bool: install_rs_check_toolchain
|
||||
|
||||
.PHONY: pcc # pcc stands for pre commit checks (except GPU)
|
||||
pcc: no_tfhe_typo no_dbg_log check_fmt lint_doc check_md_docs_are_tested check_intra_md_links \
|
||||
clippy_all check_compile_tests
|
||||
clippy_all tfhe_lints check_compile_tests
|
||||
|
||||
.PHONY: pcc_gpu # pcc stands for pre commit checks for GPU compilation
|
||||
pcc_gpu: clippy_gpu clippy_cuda_backend check_compile_tests_benches_gpu
|
||||
|
||||
@@ -4,9 +4,8 @@ use tfhe::{generate_keys, ConfigBuilder, FheUint64, FheUint8};
|
||||
use tfhe_trivium::{KreyviumStreamByte, TransCiphering};
|
||||
|
||||
pub fn kreyvium_byte_gen(c: &mut Criterion) {
|
||||
let config = ConfigBuilder::default()
|
||||
.enable_function_evaluation()
|
||||
.build();
|
||||
let config = ConfigBuilder::default().build();
|
||||
|
||||
let (client_key, server_key) = generate_keys(config);
|
||||
|
||||
let key_string = "0053A6F94C9FF24598EB000000000000".to_string();
|
||||
@@ -33,9 +32,8 @@ pub fn kreyvium_byte_gen(c: &mut Criterion) {
|
||||
}
|
||||
|
||||
pub fn kreyvium_byte_trans(c: &mut Criterion) {
|
||||
let config = ConfigBuilder::default()
|
||||
.enable_function_evaluation()
|
||||
.build();
|
||||
let config = ConfigBuilder::default().build();
|
||||
|
||||
let (client_key, server_key) = generate_keys(config);
|
||||
|
||||
let key_string = "0053A6F94C9FF24598EB000000000000".to_string();
|
||||
@@ -63,9 +61,8 @@ pub fn kreyvium_byte_trans(c: &mut Criterion) {
|
||||
}
|
||||
|
||||
pub fn kreyvium_byte_warmup(c: &mut Criterion) {
|
||||
let config = ConfigBuilder::default()
|
||||
.enable_function_evaluation()
|
||||
.build();
|
||||
let config = ConfigBuilder::default().build();
|
||||
|
||||
let (client_key, server_key) = generate_keys(config);
|
||||
|
||||
let key_string = "0053A6F94C9FF24598EB000000000000".to_string();
|
||||
|
||||
@@ -119,7 +119,7 @@ impl KreyviumStreamByte<FheUint8> {
|
||||
}
|
||||
|
||||
// Key and iv are stored in reverse in their shift registers
|
||||
let mut key = key_bytes.map(|b| b.map(|x| (x as u8).reverse_bits() as u64));
|
||||
let mut key = key_bytes.map(|b| b.reverse_bits());
|
||||
let mut iv = iv_bytes.map(|x| FheUint8::encrypt_trivial(x.reverse_bits()));
|
||||
key.reverse();
|
||||
iv.reverse();
|
||||
|
||||
@@ -299,9 +299,8 @@ fn kreyvium_test_clear_byte() {
|
||||
|
||||
#[test]
|
||||
fn kreyvium_test_byte_long() {
|
||||
let config = ConfigBuilder::default()
|
||||
.enable_function_evaluation()
|
||||
.build();
|
||||
let config = ConfigBuilder::default().build();
|
||||
|
||||
let (client_key, server_key) = generate_keys(config);
|
||||
|
||||
let key_string = "0053A6F94C9FF24598EB000000000000".to_string();
|
||||
@@ -338,9 +337,8 @@ fn kreyvium_test_byte_long() {
|
||||
|
||||
#[test]
|
||||
fn kreyvium_test_fhe_byte_transciphering_long() {
|
||||
let config = ConfigBuilder::default()
|
||||
.enable_function_evaluation()
|
||||
.build();
|
||||
let config = ConfigBuilder::default().build();
|
||||
|
||||
let (client_key, server_key) = generate_keys(config);
|
||||
|
||||
let key_string = "0053A6F94C9FF24598EB000000000000".to_string();
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "tfhe-cuda-backend"
|
||||
version = "0.3.0"
|
||||
version = "0.4.0-alpha.0"
|
||||
edition = "2021"
|
||||
authors = ["Zama team"]
|
||||
license = "BSD-3-Clause-Clear"
|
||||
|
||||
@@ -8,6 +8,18 @@ fn main() {
|
||||
}
|
||||
}
|
||||
|
||||
// This is a workaround to the current nightly toolchain (2024-06-27 which started with
|
||||
// toolchain 2024-05-05) build issue
|
||||
// Essentially if cbindgen is running, a wrong argument ends up forwarded to the cuda backend
|
||||
// "make" command during macro expansions for TFHE-rs C API, crashing make for make < 4.4 and
|
||||
// thus crashing the build
|
||||
// On the other hand, this speeds up C API build greatly given we don't have macro expansions
|
||||
// in the CUDA backend so this skips the second compilation of TFHE-rs for macro inspection by
|
||||
// cbindgen
|
||||
if std::env::var("_CBINDGEN_IS_RUNNING").is_ok() {
|
||||
return;
|
||||
}
|
||||
|
||||
println!("Build tfhe-cuda-backend");
|
||||
println!("cargo::rerun-if-changed=cuda/include");
|
||||
println!("cargo::rerun-if-changed=cuda/src");
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
#ifndef CUDA_CIPHERTEXT_H
|
||||
#define CUDA_CIPHERTEXT_H
|
||||
|
||||
#include "device.h"
|
||||
#include <cstdint>
|
||||
|
||||
extern "C" {
|
||||
@@ -14,5 +15,11 @@ void cuda_convert_lwe_ciphertext_vector_to_cpu_64(void *stream,
|
||||
void *dest, void *src,
|
||||
uint32_t number_of_cts,
|
||||
uint32_t lwe_dimension);
|
||||
|
||||
void cuda_glwe_sample_extract_64(void *stream, uint32_t gpu_index,
|
||||
void *lwe_array_out, void *glwe_array_in,
|
||||
uint32_t *nth_array, uint32_t num_glwes,
|
||||
uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size);
|
||||
};
|
||||
#endif
|
||||
|
||||
@@ -64,14 +64,8 @@ void cuda_drop(void *ptr, uint32_t gpu_index);
|
||||
void cuda_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index);
|
||||
|
||||
int cuda_get_max_shared_memory(uint32_t gpu_index);
|
||||
|
||||
void cuda_stream_add_callback(cudaStream_t stream, uint32_t gpu_index,
|
||||
cudaStreamCallback_t callback, void *user_data);
|
||||
}
|
||||
|
||||
void host_free_on_stream_callback(cudaStream_t stream, cudaError_t status,
|
||||
void *host_pointer);
|
||||
|
||||
template <typename Torus>
|
||||
void cuda_set_value_async(cudaStream_t stream, uint32_t gpu_index,
|
||||
Torus *d_array, Torus value, Torus n);
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
#ifndef HELPER_MULTI_GPU_H
|
||||
#define HELPER_MULTI_GPU_H
|
||||
#include <mutex>
|
||||
#include <variant>
|
||||
#include <vector>
|
||||
|
||||
extern std::mutex m;
|
||||
extern bool p2p_enabled;
|
||||
@@ -9,6 +11,20 @@ extern "C" {
|
||||
int cuda_setup_multi_gpu();
|
||||
}
|
||||
|
||||
// Define a variant type that can be either a vector or a single pointer
|
||||
template <typename Torus>
|
||||
using LweArrayVariant = std::variant<std::vector<Torus *>, Torus *>;
|
||||
|
||||
// Macro to define the visitor logic using std::holds_alternative for vectors
|
||||
#define GET_VARIANT_ELEMENT(variant, index) \
|
||||
[&] { \
|
||||
if (std::holds_alternative<std::vector<Torus *>>(variant)) { \
|
||||
return std::get<std::vector<Torus *>>(variant)[index]; \
|
||||
} else { \
|
||||
return std::get<Torus *>(variant); \
|
||||
} \
|
||||
}()
|
||||
|
||||
int get_active_gpu_count(int num_inputs, int gpu_count);
|
||||
|
||||
int get_num_inputs_on_gpu(int total_num_inputs, int gpu_index, int gpu_count);
|
||||
|
||||
@@ -35,8 +35,11 @@ enum COMPARISON_TYPE {
|
||||
MAX = 6,
|
||||
MIN = 7,
|
||||
};
|
||||
|
||||
enum CMP_ORDERING { IS_INFERIOR = 0, IS_EQUAL = 1, IS_SUPERIOR = 2 };
|
||||
|
||||
enum SIGNED_OPERATION { ADDITION = 1, SUBTRACTION = -1 };
|
||||
|
||||
extern "C" {
|
||||
void scratch_cuda_apply_univariate_lut_kb_64(
|
||||
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
|
||||
@@ -81,9 +84,8 @@ void scratch_cuda_full_propagation_64(
|
||||
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
|
||||
uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t num_radix_blocks,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
bool allocate_gpu_memory);
|
||||
uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory);
|
||||
|
||||
void cuda_full_propagation_64_inplace(void **streams, uint32_t *gpu_indexes,
|
||||
uint32_t gpu_count, void *input_blocks,
|
||||
@@ -99,7 +101,7 @@ void scratch_cuda_integer_mult_radix_ciphertext_kb_64(
|
||||
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t pbs_base_log,
|
||||
uint32_t pbs_level, uint32_t ks_base_log, uint32_t ks_level,
|
||||
uint32_t grouping_factor, uint32_t num_blocks, PBS_TYPE pbs_type,
|
||||
uint32_t max_shared_memory, bool allocate_gpu_memory);
|
||||
bool allocate_gpu_memory);
|
||||
|
||||
void cuda_integer_mult_radix_ciphertext_kb_64(
|
||||
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
|
||||
@@ -281,7 +283,7 @@ void cleanup_cuda_propagate_single_carry(void **streams, uint32_t *gpu_indexes,
|
||||
uint32_t gpu_count,
|
||||
int8_t **mem_ptr_void);
|
||||
|
||||
void scratch_cuda_integer_radix_sum_ciphertexts_vec_kb_64(
|
||||
void scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
|
||||
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t lwe_dimension,
|
||||
uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
|
||||
@@ -290,15 +292,14 @@ void scratch_cuda_integer_radix_sum_ciphertexts_vec_kb_64(
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
bool allocate_gpu_memory);
|
||||
|
||||
void cuda_integer_radix_sum_ciphertexts_vec_kb_64(
|
||||
void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
|
||||
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
|
||||
void *radix_lwe_out, void *radix_lwe_vec, uint32_t num_radix_in_vec,
|
||||
int8_t *mem_ptr, void **bsks, void **ksks, uint32_t num_blocks_in_radix);
|
||||
|
||||
void cleanup_cuda_integer_radix_sum_ciphertexts_vec(void **streams,
|
||||
uint32_t *gpu_indexes,
|
||||
uint32_t gpu_count,
|
||||
int8_t **mem_ptr_void);
|
||||
void cleanup_cuda_integer_radix_partial_sum_ciphertexts_vec(
|
||||
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
|
||||
int8_t **mem_ptr_void);
|
||||
|
||||
void scratch_cuda_integer_radix_overflowing_sub_kb_64(
|
||||
void **stream, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
|
||||
@@ -355,6 +356,48 @@ void cuda_integer_div_rem_radix_ciphertext_kb_64(
|
||||
void cleanup_cuda_integer_div_rem(void **streams, uint32_t *gpu_indexes,
|
||||
uint32_t gpu_count, int8_t **mem_ptr_void);
|
||||
|
||||
void scratch_cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64(
|
||||
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
|
||||
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
uint32_t grouping_factor, uint32_t num_blocks, int8_t signed_operation,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
bool allocate_gpu_memory);
|
||||
|
||||
void cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64(
|
||||
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lhs,
|
||||
void *rhs, void *overflowed, int8_t signed_operation, int8_t *mem_ptr,
|
||||
void **bsks, void **ksks, uint32_t num_blocks_in_radix);
|
||||
|
||||
void cleanup_signed_overflowing_add_or_sub(void **streams,
|
||||
uint32_t *gpu_indexes,
|
||||
uint32_t gpu_count,
|
||||
int8_t **mem_ptr_void);
|
||||
|
||||
void scratch_cuda_integer_compute_prefix_sum_hillis_steele_64(
|
||||
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
|
||||
void *input_lut, uint32_t lwe_dimension, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
|
||||
PBS_TYPE pbs_type, bool allocate_gpu_memory);
|
||||
|
||||
void cuda_integer_compute_prefix_sum_hillis_steele_64(
|
||||
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
|
||||
void *output_radix_lwe, void *input_radix_lwe, int8_t *mem_ptr, void **ksks,
|
||||
void **bsks, uint32_t num_blocks, uint32_t shift);
|
||||
|
||||
void cleanup_cuda_integer_compute_prefix_sum_hillis_steele_64(
|
||||
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
|
||||
int8_t **mem_ptr_void);
|
||||
|
||||
void cuda_integer_reverse_blocks_64_inplace(void **streams,
|
||||
uint32_t *gpu_indexes,
|
||||
uint32_t gpu_count, void *lwe_array,
|
||||
uint32_t num_blocks,
|
||||
uint32_t lwe_size);
|
||||
|
||||
} // extern C
|
||||
|
||||
template <typename Torus>
|
||||
@@ -466,11 +509,21 @@ template <typename Torus> struct int_radix_lut {
|
||||
// for the moment
|
||||
Torus *lwe_indexes_in;
|
||||
Torus *lwe_indexes_out;
|
||||
Torus *h_lwe_indexes_in;
|
||||
Torus *h_lwe_indexes_out;
|
||||
// Enable optimizations if lwe_indexes_(in/out) are trivial
|
||||
bool using_trivial_lwe_indexes = true;
|
||||
// lwe_trivial_indexes is the intermediary index we need in case
|
||||
// lwe_indexes_in != lwe_indexes_out
|
||||
Torus *lwe_trivial_indexes;
|
||||
Torus *tmp_lwe_before_ks;
|
||||
Torus *tmp_lwe_after_ks;
|
||||
|
||||
/// For multi GPU execution we create vectors of pointers for inputs and
|
||||
/// outputs
|
||||
std::vector<Torus *> lwe_array_in_vec;
|
||||
std::vector<Torus *> lwe_after_ks_vec;
|
||||
std::vector<Torus *> lwe_after_pbs_vec;
|
||||
std::vector<Torus *> lwe_trivial_indexes_vec;
|
||||
|
||||
int_radix_lut(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
uint32_t gpu_count, int_radix_params params, uint32_t num_luts,
|
||||
@@ -490,13 +543,12 @@ template <typename Torus> struct int_radix_lut {
|
||||
cudaSetDevice(i);
|
||||
int8_t *gpu_pbs_buffer;
|
||||
auto num_blocks_on_gpu =
|
||||
get_num_inputs_on_gpu(num_radix_blocks, i, gpu_count);
|
||||
get_num_inputs_on_gpu(num_radix_blocks, i, active_gpu_count);
|
||||
|
||||
execute_scratch_pbs<Torus>(
|
||||
streams[i], gpu_indexes[i], &gpu_pbs_buffer, params.glwe_dimension,
|
||||
params.small_lwe_dimension, params.polynomial_size, params.pbs_level,
|
||||
params.grouping_factor, num_blocks_on_gpu,
|
||||
cuda_get_max_shared_memory(gpu_indexes[i]), params.pbs_type,
|
||||
params.grouping_factor, num_blocks_on_gpu, params.pbs_type,
|
||||
allocate_gpu_memory);
|
||||
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
|
||||
buffer.push_back(gpu_pbs_buffer);
|
||||
@@ -530,22 +582,43 @@ template <typename Torus> struct int_radix_lut {
|
||||
num_radix_blocks * sizeof(Torus), streams[0], gpu_indexes[0]);
|
||||
lwe_trivial_indexes = (Torus *)cuda_malloc_async(
|
||||
num_radix_blocks * sizeof(Torus), streams[0], gpu_indexes[0]);
|
||||
auto h_lwe_indexes = (Torus *)malloc(num_radix_blocks * sizeof(Torus));
|
||||
|
||||
h_lwe_indexes_in = (Torus *)malloc(num_radix_blocks * sizeof(Torus));
|
||||
h_lwe_indexes_out = (Torus *)malloc(num_radix_blocks * sizeof(Torus));
|
||||
|
||||
for (int i = 0; i < num_radix_blocks; i++)
|
||||
h_lwe_indexes[i] = i;
|
||||
h_lwe_indexes_in[i] = i;
|
||||
|
||||
cuda_memcpy_async_to_gpu(lwe_indexes_in, h_lwe_indexes,
|
||||
cuda_memcpy_async_to_gpu(lwe_indexes_in, h_lwe_indexes_in,
|
||||
num_radix_blocks * sizeof(Torus), streams[0],
|
||||
gpu_indexes[0]);
|
||||
cuda_memcpy_async_to_gpu(lwe_indexes_out, h_lwe_indexes,
|
||||
cuda_memcpy_async_to_gpu(lwe_indexes_out, h_lwe_indexes_in,
|
||||
num_radix_blocks * sizeof(Torus), streams[0],
|
||||
gpu_indexes[0]);
|
||||
cuda_memcpy_async_to_gpu(lwe_trivial_indexes, h_lwe_indexes,
|
||||
cuda_memcpy_async_to_gpu(lwe_trivial_indexes, h_lwe_indexes_in,
|
||||
num_radix_blocks * sizeof(Torus), streams[0],
|
||||
gpu_indexes[0]);
|
||||
cuda_stream_add_callback(streams[0], gpu_indexes[0],
|
||||
host_free_on_stream_callback, h_lwe_indexes);
|
||||
memcpy(h_lwe_indexes_out, h_lwe_indexes_in,
|
||||
num_radix_blocks * sizeof(Torus));
|
||||
|
||||
/// With multiple GPUs we allocate arrays to be pushed to the vectors and
|
||||
/// copy data on each GPU then when we gather data to GPU 0 we can copy
|
||||
/// back to the original indexing
|
||||
multi_gpu_alloc_lwe_async(streams, gpu_indexes, active_gpu_count,
|
||||
lwe_array_in_vec, num_radix_blocks,
|
||||
params.big_lwe_dimension + 1);
|
||||
multi_gpu_alloc_lwe_async(streams, gpu_indexes, active_gpu_count,
|
||||
lwe_after_ks_vec, num_radix_blocks,
|
||||
params.small_lwe_dimension + 1);
|
||||
multi_gpu_alloc_lwe_async(streams, gpu_indexes, active_gpu_count,
|
||||
lwe_after_pbs_vec, num_radix_blocks,
|
||||
params.big_lwe_dimension + 1);
|
||||
multi_gpu_alloc_array_async(streams, gpu_indexes, active_gpu_count,
|
||||
lwe_trivial_indexes_vec, num_radix_blocks);
|
||||
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
|
||||
multi_gpu_copy_array_async(streams, gpu_indexes, active_gpu_count,
|
||||
lwe_trivial_indexes_vec, lwe_trivial_indexes,
|
||||
num_radix_blocks);
|
||||
|
||||
// Keyswitch
|
||||
Torus big_size =
|
||||
@@ -554,8 +627,6 @@ template <typename Torus> struct int_radix_lut {
|
||||
(params.small_lwe_dimension + 1) * num_radix_blocks * sizeof(Torus);
|
||||
tmp_lwe_before_ks =
|
||||
(Torus *)cuda_malloc_async(big_size, streams[0], gpu_indexes[0]);
|
||||
tmp_lwe_after_ks =
|
||||
(Torus *)cuda_malloc_async(small_size, streams[0], gpu_indexes[0]);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -577,7 +648,14 @@ template <typename Torus> struct int_radix_lut {
|
||||
buffer = base_lut_object->buffer;
|
||||
// Keyswitch
|
||||
tmp_lwe_before_ks = base_lut_object->tmp_lwe_before_ks;
|
||||
tmp_lwe_after_ks = base_lut_object->tmp_lwe_after_ks;
|
||||
|
||||
/// With multiple GPUs we allocate arrays to be pushed to the vectors and
|
||||
/// copy data on each GPU then when we gather data to GPU 0 we can copy back
|
||||
/// to the original indexing
|
||||
lwe_array_in_vec = base_lut_object->lwe_array_in_vec;
|
||||
lwe_after_ks_vec = base_lut_object->lwe_after_ks_vec;
|
||||
lwe_after_pbs_vec = base_lut_object->lwe_after_pbs_vec;
|
||||
lwe_trivial_indexes_vec = base_lut_object->lwe_trivial_indexes_vec;
|
||||
|
||||
mem_reuse = true;
|
||||
|
||||
@@ -609,22 +687,24 @@ template <typename Torus> struct int_radix_lut {
|
||||
num_radix_blocks * sizeof(Torus), streams[0], gpu_indexes[0]);
|
||||
lwe_trivial_indexes = (Torus *)cuda_malloc_async(
|
||||
num_radix_blocks * sizeof(Torus), streams[0], gpu_indexes[0]);
|
||||
auto h_lwe_indexes = (Torus *)malloc(num_radix_blocks * sizeof(Torus));
|
||||
|
||||
h_lwe_indexes_in = (Torus *)malloc(num_radix_blocks * sizeof(Torus));
|
||||
h_lwe_indexes_out = (Torus *)malloc(num_radix_blocks * sizeof(Torus));
|
||||
|
||||
for (int i = 0; i < num_radix_blocks; i++)
|
||||
h_lwe_indexes[i] = i;
|
||||
h_lwe_indexes_in[i] = i;
|
||||
|
||||
cuda_memcpy_async_to_gpu(lwe_indexes_in, h_lwe_indexes,
|
||||
cuda_memcpy_async_to_gpu(lwe_indexes_in, h_lwe_indexes_in,
|
||||
num_radix_blocks * sizeof(Torus), streams[0],
|
||||
gpu_indexes[0]);
|
||||
cuda_memcpy_async_to_gpu(lwe_indexes_out, h_lwe_indexes,
|
||||
cuda_memcpy_async_to_gpu(lwe_indexes_out, h_lwe_indexes_in,
|
||||
num_radix_blocks * sizeof(Torus), streams[0],
|
||||
gpu_indexes[0]);
|
||||
cuda_memcpy_async_to_gpu(lwe_trivial_indexes, h_lwe_indexes,
|
||||
cuda_memcpy_async_to_gpu(lwe_trivial_indexes, h_lwe_indexes_in,
|
||||
num_radix_blocks * sizeof(Torus), streams[0],
|
||||
gpu_indexes[0]);
|
||||
cuda_stream_add_callback(streams[0], gpu_indexes[0],
|
||||
host_free_on_stream_callback, h_lwe_indexes);
|
||||
memcpy(h_lwe_indexes_out, h_lwe_indexes_in,
|
||||
num_radix_blocks * sizeof(Torus));
|
||||
}
|
||||
|
||||
// Return a pointer to idx-ith lut at gpu_index's global memory
|
||||
@@ -642,6 +722,22 @@ template <typename Torus> struct int_radix_lut {
|
||||
return &lut_indexes[ind];
|
||||
}
|
||||
|
||||
// If this function is called we assume the lwe_indexes_(in/out) are not the
|
||||
// trivial anymore and thus we disable optimizations
|
||||
void set_lwe_indexes(cudaStream_t stream, uint32_t gpu_index,
|
||||
Torus *h_indexes_in, Torus *h_indexes_out) {
|
||||
|
||||
memcpy(h_lwe_indexes_in, h_indexes_in, num_blocks * sizeof(Torus));
|
||||
memcpy(h_lwe_indexes_out, h_indexes_out, num_blocks * sizeof(Torus));
|
||||
|
||||
cuda_memcpy_async_to_gpu(lwe_indexes_in, h_lwe_indexes_in,
|
||||
num_blocks * sizeof(Torus), stream, gpu_index);
|
||||
cuda_memcpy_async_to_gpu(lwe_indexes_out, h_lwe_indexes_out,
|
||||
num_blocks * sizeof(Torus), stream, gpu_index);
|
||||
|
||||
using_trivial_lwe_indexes = false;
|
||||
}
|
||||
|
||||
// Broadcast luts from gpu src_gpu_idx to all active gpus
|
||||
void broadcast_lut(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
uint32_t src_gpu_idx) {
|
||||
@@ -651,7 +747,6 @@ template <typename Torus> struct int_radix_lut {
|
||||
auto src_lut_indexes = lut_indexes_vec[src_gpu_idx];
|
||||
|
||||
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
|
||||
#pragma omp parallel for num_threads(active_gpu_count)
|
||||
for (uint i = 0; i < active_gpu_count; i++) {
|
||||
if (i != src_gpu_idx) {
|
||||
auto dst_lut = lut_vec[i];
|
||||
@@ -669,7 +764,6 @@ template <typename Torus> struct int_radix_lut {
|
||||
|
||||
void release(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
uint32_t gpu_count) {
|
||||
#pragma omp parallel for num_threads(active_gpu_count)
|
||||
for (uint i = 0; i < active_gpu_count; i++) {
|
||||
cuda_drop_async(lut_vec[i], streams[i], gpu_indexes[i]);
|
||||
cuda_drop_async(lut_indexes_vec[i], streams[i], gpu_indexes[i]);
|
||||
@@ -680,9 +774,13 @@ template <typename Torus> struct int_radix_lut {
|
||||
cuda_drop_async(lwe_indexes_in, streams[0], gpu_indexes[0]);
|
||||
cuda_drop_async(lwe_indexes_out, streams[0], gpu_indexes[0]);
|
||||
cuda_drop_async(lwe_trivial_indexes, streams[0], gpu_indexes[0]);
|
||||
|
||||
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
|
||||
free(h_lwe_indexes_in);
|
||||
free(h_lwe_indexes_out);
|
||||
|
||||
if (!mem_reuse) {
|
||||
cuda_drop_async(tmp_lwe_before_ks, streams[0], gpu_indexes[0]);
|
||||
cuda_drop_async(tmp_lwe_after_ks, streams[0], gpu_indexes[0]);
|
||||
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
|
||||
for (int i = 0; i < buffer.size(); i++) {
|
||||
switch (params.pbs_type) {
|
||||
@@ -700,6 +798,17 @@ template <typename Torus> struct int_radix_lut {
|
||||
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
|
||||
}
|
||||
buffer.clear();
|
||||
|
||||
multi_gpu_release_async(streams, gpu_indexes, lwe_array_in_vec);
|
||||
multi_gpu_release_async(streams, gpu_indexes, lwe_after_ks_vec);
|
||||
multi_gpu_release_async(streams, gpu_indexes, lwe_after_pbs_vec);
|
||||
multi_gpu_release_async(streams, gpu_indexes, lwe_trivial_indexes_vec);
|
||||
for (uint i = 0; i < active_gpu_count; i++)
|
||||
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
|
||||
lwe_array_in_vec.clear();
|
||||
lwe_after_ks_vec.clear();
|
||||
lwe_after_pbs_vec.clear();
|
||||
lwe_trivial_indexes_vec.clear();
|
||||
}
|
||||
}
|
||||
};
|
||||
@@ -749,8 +858,6 @@ template <typename Torus> struct int_bit_extract_luts_buffer {
|
||||
num_radix_blocks * bits_per_block * sizeof(Torus), streams[0],
|
||||
gpu_indexes[0]);
|
||||
lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
|
||||
cuda_stream_add_callback(streams[0], gpu_indexes[0],
|
||||
host_free_on_stream_callback, h_lut_indexes);
|
||||
|
||||
/**
|
||||
* the input indexes should take the first bits_per_block PBS to target
|
||||
@@ -763,12 +870,6 @@ template <typename Torus> struct int_bit_extract_luts_buffer {
|
||||
for (int i = 0; i < bits_per_block; i++)
|
||||
h_lwe_indexes_in[i + j * bits_per_block] = j;
|
||||
}
|
||||
cuda_memcpy_async_to_gpu(lut->lwe_indexes_in, h_lwe_indexes_in,
|
||||
num_radix_blocks * bits_per_block *
|
||||
sizeof(Torus),
|
||||
streams[0], gpu_indexes[0]);
|
||||
cuda_stream_add_callback(streams[0], gpu_indexes[0],
|
||||
host_free_on_stream_callback, h_lwe_indexes_in);
|
||||
|
||||
/**
|
||||
* the output should aim different lwe ciphertexts, so lwe_indexes_out =
|
||||
@@ -780,12 +881,13 @@ template <typename Torus> struct int_bit_extract_luts_buffer {
|
||||
for (int i = 0; i < num_radix_blocks * bits_per_block; i++)
|
||||
h_lwe_indexes_out[i] = i;
|
||||
|
||||
cuda_memcpy_async_to_gpu(lut->lwe_indexes_out, h_lwe_indexes_out,
|
||||
num_radix_blocks * bits_per_block *
|
||||
sizeof(Torus),
|
||||
streams[0], gpu_indexes[0]);
|
||||
cuda_stream_add_callback(streams[0], gpu_indexes[0],
|
||||
host_free_on_stream_callback, h_lwe_indexes_out);
|
||||
lut->set_lwe_indexes(streams[0], gpu_indexes[0], h_lwe_indexes_in,
|
||||
h_lwe_indexes_out);
|
||||
|
||||
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
|
||||
free(h_lut_indexes);
|
||||
free(h_lwe_indexes_in);
|
||||
free(h_lwe_indexes_out);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -955,10 +1057,10 @@ template <typename Torus> struct int_fullprop_buffer {
|
||||
|
||||
int_fullprop_buffer(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
uint32_t gpu_count, int_radix_params params,
|
||||
uint32_t num_radix_blocks, bool allocate_gpu_memory) {
|
||||
bool allocate_gpu_memory) {
|
||||
this->params = params;
|
||||
lut = new int_radix_lut<Torus>(streams, gpu_indexes, 1, params, 2,
|
||||
num_radix_blocks, allocate_gpu_memory);
|
||||
lut = new int_radix_lut<Torus>(streams, gpu_indexes, 1, params, 2, 2,
|
||||
allocate_gpu_memory);
|
||||
|
||||
if (allocate_gpu_memory) {
|
||||
|
||||
@@ -984,15 +1086,13 @@ template <typename Torus> struct int_fullprop_buffer {
|
||||
params.polynomial_size, params.message_modulus, params.carry_modulus,
|
||||
lut_f_carry);
|
||||
|
||||
Torus lwe_indexes_size = num_radix_blocks * sizeof(Torus);
|
||||
Torus lwe_indexes_size = 2 * sizeof(Torus);
|
||||
Torus *h_lwe_indexes = (Torus *)malloc(lwe_indexes_size);
|
||||
for (int i = 0; i < num_radix_blocks; i++)
|
||||
for (int i = 0; i < 2; i++)
|
||||
h_lwe_indexes[i] = i;
|
||||
Torus *lwe_indexes = lut->get_lut_indexes(gpu_indexes[0], 0);
|
||||
cuda_memcpy_async_to_gpu(lwe_indexes, h_lwe_indexes, lwe_indexes_size,
|
||||
streams[0], gpu_indexes[0]);
|
||||
cuda_stream_add_callback(streams[0], gpu_indexes[0],
|
||||
host_free_on_stream_callback, h_lwe_indexes);
|
||||
|
||||
lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
|
||||
|
||||
@@ -1007,6 +1107,8 @@ template <typename Torus> struct int_fullprop_buffer {
|
||||
small_vector_size, streams[0], gpu_indexes[0]);
|
||||
tmp_big_lwe_vector = (Torus *)cuda_malloc_async(
|
||||
big_vector_size, streams[0], gpu_indexes[0]);
|
||||
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
|
||||
free(h_lwe_indexes);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1014,6 +1116,7 @@ template <typename Torus> struct int_fullprop_buffer {
|
||||
uint32_t gpu_count) {
|
||||
|
||||
lut->release(streams, gpu_indexes, 1);
|
||||
delete lut;
|
||||
|
||||
cuda_drop_async(tmp_small_lwe_vector, streams[0], gpu_indexes[0]);
|
||||
cuda_drop_async(tmp_big_lwe_vector, streams[0], gpu_indexes[0]);
|
||||
@@ -1135,7 +1238,7 @@ template <typename Torus> struct int_sc_prop_memory {
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Torus> struct int_single_borrow_prop_memory {
|
||||
template <typename Torus> struct int_overflowing_sub_memory {
|
||||
Torus *generates_or_propagates;
|
||||
Torus *step_output;
|
||||
|
||||
@@ -1147,10 +1250,10 @@ template <typename Torus> struct int_single_borrow_prop_memory {
|
||||
|
||||
int_radix_params params;
|
||||
|
||||
int_single_borrow_prop_memory(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
uint32_t gpu_count, int_radix_params params,
|
||||
uint32_t num_radix_blocks,
|
||||
bool allocate_gpu_memory) {
|
||||
int_overflowing_sub_memory(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
uint32_t gpu_count, int_radix_params params,
|
||||
uint32_t num_radix_blocks,
|
||||
bool allocate_gpu_memory) {
|
||||
this->params = params;
|
||||
auto glwe_dimension = params.glwe_dimension;
|
||||
auto polynomial_size = params.polynomial_size;
|
||||
@@ -1335,60 +1438,6 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Torus> struct int_overflowing_sub_memory {
|
||||
int_radix_params params;
|
||||
int_radix_lut<Torus> *luts_message_carry;
|
||||
int_single_borrow_prop_memory<Torus> *borrow_prop_mem;
|
||||
int_overflowing_sub_memory(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
uint32_t gpu_count, int_radix_params params,
|
||||
uint32_t num_blocks, bool allocate_gpu_memory) {
|
||||
this->params = params;
|
||||
auto message_modulus = params.message_modulus;
|
||||
auto carry_modulus = params.carry_modulus;
|
||||
|
||||
borrow_prop_mem = new int_single_borrow_prop_memory<Torus>(
|
||||
streams, gpu_indexes, gpu_count, params, num_blocks,
|
||||
allocate_gpu_memory);
|
||||
|
||||
int max_pbs_count = num_blocks * 2;
|
||||
|
||||
// create lut object for message and carry
|
||||
luts_message_carry =
|
||||
new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 2,
|
||||
max_pbs_count, allocate_gpu_memory);
|
||||
|
||||
auto message_acc = luts_message_carry->get_lut(gpu_indexes[0], 0);
|
||||
auto carry_acc = luts_message_carry->get_lut(gpu_indexes[0], 1);
|
||||
|
||||
// define functions for each accumulator
|
||||
auto lut_f_message = [message_modulus](Torus x) -> Torus {
|
||||
return x % message_modulus;
|
||||
};
|
||||
auto lut_f_carry = [message_modulus](Torus x) -> Torus {
|
||||
return x / message_modulus;
|
||||
};
|
||||
|
||||
// generate accumulators
|
||||
generate_device_accumulator<Torus>(
|
||||
streams[0], gpu_indexes[0], message_acc, params.glwe_dimension,
|
||||
params.polynomial_size, message_modulus, carry_modulus, lut_f_message);
|
||||
generate_device_accumulator<Torus>(
|
||||
streams[0], gpu_indexes[0], carry_acc, params.glwe_dimension,
|
||||
params.polynomial_size, message_modulus, carry_modulus, lut_f_carry);
|
||||
|
||||
luts_message_carry->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
|
||||
}
|
||||
|
||||
void release(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
uint32_t gpu_count) {
|
||||
luts_message_carry->release(streams, gpu_indexes, gpu_count);
|
||||
borrow_prop_mem->release(streams, gpu_indexes, gpu_count);
|
||||
|
||||
delete luts_message_carry;
|
||||
delete borrow_prop_mem;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Torus> struct int_mul_memory {
|
||||
Torus *vector_result_sb;
|
||||
Torus *block_mul_res;
|
||||
@@ -1700,6 +1749,7 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
|
||||
|
||||
cudaStream_t *local_streams_1;
|
||||
cudaStream_t *local_streams_2;
|
||||
uint32_t active_gpu_count;
|
||||
|
||||
int_arithmetic_scalar_shift_buffer(cudaStream_t *streams,
|
||||
uint32_t *gpu_indexes, uint32_t gpu_count,
|
||||
@@ -1707,12 +1757,15 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
|
||||
int_radix_params params,
|
||||
uint32_t num_radix_blocks,
|
||||
bool allocate_gpu_memory) {
|
||||
active_gpu_count = get_active_gpu_count(1, gpu_count);
|
||||
// In the arithmetic shift, a PBS has to be applied to the last rotated
|
||||
// block twice: once to shift it, once to compute the padding block to be
|
||||
// copied onto all blocks to the left of the last rotated block
|
||||
local_streams_1 = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t));
|
||||
local_streams_2 = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t));
|
||||
for (uint j = 0; j < gpu_count; j++) {
|
||||
local_streams_1 =
|
||||
(cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
|
||||
local_streams_2 =
|
||||
(cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
|
||||
for (uint j = 0; j < active_gpu_count; j++) {
|
||||
local_streams_1[j] = cuda_create_stream(gpu_indexes[j]);
|
||||
local_streams_2[j] = cuda_create_stream(gpu_indexes[j]);
|
||||
}
|
||||
@@ -1723,12 +1776,12 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
|
||||
uint32_t big_lwe_size = params.big_lwe_dimension + 1;
|
||||
uint32_t big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
|
||||
|
||||
tmp_rotated = (Torus *)cuda_malloc_async((num_radix_blocks + 2) *
|
||||
tmp_rotated = (Torus *)cuda_malloc_async((num_radix_blocks + 3) *
|
||||
big_lwe_size_bytes,
|
||||
streams[0], gpu_indexes[0]);
|
||||
|
||||
cuda_memset_async(tmp_rotated, 0,
|
||||
(num_radix_blocks + 2) * big_lwe_size_bytes, streams[0],
|
||||
(num_radix_blocks + 3) * big_lwe_size_bytes, streams[0],
|
||||
gpu_indexes[0]);
|
||||
|
||||
uint32_t num_bits_in_block = (uint32_t)std::log2(params.message_modulus);
|
||||
@@ -1845,7 +1898,7 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
|
||||
|
||||
void release(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
uint32_t gpu_count) {
|
||||
for (uint j = 0; j < gpu_count; j++) {
|
||||
for (uint j = 0; j < active_gpu_count; j++) {
|
||||
cuda_destroy_stream(local_streams_1[j], gpu_indexes[j]);
|
||||
cuda_destroy_stream(local_streams_2[j], gpu_indexes[j]);
|
||||
}
|
||||
@@ -1874,20 +1927,24 @@ template <typename Torus> struct int_zero_out_if_buffer {
|
||||
|
||||
cudaStream_t *true_streams;
|
||||
cudaStream_t *false_streams;
|
||||
uint32_t active_gpu_count;
|
||||
|
||||
int_zero_out_if_buffer(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
uint32_t gpu_count, int_radix_params params,
|
||||
uint32_t num_radix_blocks, bool allocate_gpu_memory) {
|
||||
this->params = params;
|
||||
active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
|
||||
|
||||
Torus big_size =
|
||||
(params.big_lwe_dimension + 1) * num_radix_blocks * sizeof(Torus);
|
||||
if (allocate_gpu_memory) {
|
||||
tmp = (Torus *)cuda_malloc_async(big_size, streams[0], gpu_indexes[0]);
|
||||
// We may use a different stream to allow concurrent operation
|
||||
true_streams = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t));
|
||||
false_streams = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t));
|
||||
for (uint j = 0; j < gpu_count; j++) {
|
||||
true_streams =
|
||||
(cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
|
||||
false_streams =
|
||||
(cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
|
||||
for (uint j = 0; j < active_gpu_count; j++) {
|
||||
true_streams[j] = cuda_create_stream(gpu_indexes[j]);
|
||||
false_streams[j] = cuda_create_stream(gpu_indexes[j]);
|
||||
}
|
||||
@@ -1896,7 +1953,7 @@ template <typename Torus> struct int_zero_out_if_buffer {
|
||||
void release(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
uint32_t gpu_count) {
|
||||
cuda_drop_async(tmp, streams[0], gpu_indexes[0]);
|
||||
for (uint j = 0; j < gpu_count; j++) {
|
||||
for (uint j = 0; j < active_gpu_count; j++) {
|
||||
cuda_destroy_stream(true_streams[j], gpu_indexes[j]);
|
||||
cuda_destroy_stream(false_streams[j], gpu_indexes[j]);
|
||||
}
|
||||
@@ -2046,6 +2103,7 @@ template <typename Torus> struct int_are_all_block_true_buffer {
|
||||
uint32_t gpu_count) {
|
||||
for (auto &lut : is_equal_to_lut_map) {
|
||||
lut.second->release(streams, gpu_indexes, gpu_count);
|
||||
delete (lut.second);
|
||||
}
|
||||
is_equal_to_lut_map.clear();
|
||||
|
||||
@@ -2192,13 +2250,11 @@ template <typename Torus> struct int_tree_sign_reduction_buffer {
|
||||
new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 1,
|
||||
num_radix_blocks, allocate_gpu_memory);
|
||||
|
||||
tree_last_leaf_lut =
|
||||
new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 1,
|
||||
num_radix_blocks, allocate_gpu_memory);
|
||||
tree_last_leaf_lut = new int_radix_lut<Torus>(
|
||||
streams, gpu_indexes, gpu_count, params, 1, 1, allocate_gpu_memory);
|
||||
|
||||
tree_last_leaf_scalar_lut =
|
||||
new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 1,
|
||||
num_radix_blocks, allocate_gpu_memory);
|
||||
tree_last_leaf_scalar_lut = new int_radix_lut<Torus>(
|
||||
streams, gpu_indexes, gpu_count, params, 1, 1, allocate_gpu_memory);
|
||||
generate_device_accumulator_bivariate<Torus>(
|
||||
streams[0], gpu_indexes[0],
|
||||
tree_inner_leaf_lut->get_lut(gpu_indexes[0], 0),
|
||||
@@ -2330,6 +2386,7 @@ template <typename Torus> struct int_comparison_buffer {
|
||||
int_radix_lut<Torus> *signed_msb_lut;
|
||||
cudaStream_t *lsb_streams;
|
||||
cudaStream_t *msb_streams;
|
||||
uint32_t active_gpu_count;
|
||||
|
||||
int_comparison_buffer(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
uint32_t gpu_count, COMPARISON_TYPE op,
|
||||
@@ -2339,14 +2396,18 @@ template <typename Torus> struct int_comparison_buffer {
|
||||
this->op = op;
|
||||
this->is_signed = is_signed;
|
||||
|
||||
active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
|
||||
|
||||
identity_lut_f = [](Torus x) -> Torus { return x; };
|
||||
|
||||
auto big_lwe_size = params.big_lwe_dimension + 1;
|
||||
|
||||
if (allocate_gpu_memory) {
|
||||
lsb_streams = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t));
|
||||
msb_streams = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t));
|
||||
for (uint j = 0; j < gpu_count; j++) {
|
||||
lsb_streams =
|
||||
(cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
|
||||
msb_streams =
|
||||
(cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
|
||||
for (uint j = 0; j < active_gpu_count; j++) {
|
||||
lsb_streams[j] = cuda_create_stream(gpu_indexes[j]);
|
||||
msb_streams[j] = cuda_create_stream(gpu_indexes[j]);
|
||||
}
|
||||
@@ -2510,7 +2571,7 @@ template <typename Torus> struct int_comparison_buffer {
|
||||
signed_msb_lut->release(streams, gpu_indexes, gpu_count);
|
||||
delete (signed_msb_lut);
|
||||
}
|
||||
for (uint j = 0; j < gpu_count; j++) {
|
||||
for (uint j = 0; j < active_gpu_count; j++) {
|
||||
cuda_destroy_stream(lsb_streams[j], gpu_indexes[j]);
|
||||
cuda_destroy_stream(msb_streams[j], gpu_indexes[j]);
|
||||
}
|
||||
@@ -2521,6 +2582,7 @@ template <typename Torus> struct int_comparison_buffer {
|
||||
|
||||
template <typename Torus> struct int_div_rem_memory {
|
||||
int_radix_params params;
|
||||
uint32_t active_gpu_count;
|
||||
|
||||
// memory objects for other operations
|
||||
int_logical_scalar_shift_buffer<Torus> *shift_mem_1;
|
||||
@@ -2627,7 +2689,7 @@ template <typename Torus> struct int_div_rem_memory {
|
||||
[shifted_mask](Torus x) -> Torus { return x & shifted_mask; };
|
||||
|
||||
masking_luts_1[i] = new int_radix_lut<Torus>(
|
||||
streams, gpu_indexes, gpu_count, params, 1, num_blocks, true);
|
||||
streams, gpu_indexes, gpu_count, params, 1, 1, true);
|
||||
masking_luts_2[i] = new int_radix_lut<Torus>(
|
||||
streams, gpu_indexes, gpu_count, params, 1, num_blocks, true);
|
||||
|
||||
@@ -2741,7 +2803,7 @@ template <typename Torus> struct int_div_rem_memory {
|
||||
};
|
||||
|
||||
merge_overflow_flags_luts[i] = new int_radix_lut<Torus>(
|
||||
streams, gpu_indexes, gpu_count, params, 1, num_blocks, true);
|
||||
streams, gpu_indexes, gpu_count, params, 1, 1, true);
|
||||
|
||||
generate_device_accumulator_bivariate<Torus>(
|
||||
streams[0], gpu_indexes[0],
|
||||
@@ -2756,6 +2818,8 @@ template <typename Torus> struct int_div_rem_memory {
|
||||
int_div_rem_memory(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
uint32_t gpu_count, int_radix_params params,
|
||||
uint32_t num_blocks, bool allocate_gpu_memory) {
|
||||
active_gpu_count = get_active_gpu_count(2 * num_blocks, gpu_count);
|
||||
|
||||
this->params = params;
|
||||
shift_mem_1 = new int_logical_scalar_shift_buffer<Torus>(
|
||||
streams, gpu_indexes, gpu_count, SHIFT_OR_ROTATE_TYPE::LEFT_SHIFT,
|
||||
@@ -2775,11 +2839,15 @@ template <typename Torus> struct int_div_rem_memory {
|
||||
init_lookup_tables(streams, gpu_indexes, gpu_count, num_blocks);
|
||||
init_temporary_buffers(streams, gpu_indexes, gpu_count, num_blocks);
|
||||
|
||||
sub_streams_1 = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t));
|
||||
sub_streams_2 = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t));
|
||||
sub_streams_3 = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t));
|
||||
sub_streams_4 = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t));
|
||||
for (uint j = 0; j < gpu_count; j++) {
|
||||
sub_streams_1 =
|
||||
(cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
|
||||
sub_streams_2 =
|
||||
(cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
|
||||
sub_streams_3 =
|
||||
(cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
|
||||
sub_streams_4 =
|
||||
(cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
|
||||
for (uint j = 0; j < active_gpu_count; j++) {
|
||||
sub_streams_1[j] = cuda_create_stream(gpu_indexes[j]);
|
||||
sub_streams_2[j] = cuda_create_stream(gpu_indexes[j]);
|
||||
sub_streams_3[j] = cuda_create_stream(gpu_indexes[j]);
|
||||
@@ -2850,7 +2918,7 @@ template <typename Torus> struct int_div_rem_memory {
|
||||
delete[] merge_overflow_flags_luts;
|
||||
|
||||
// release sub streams
|
||||
for (uint i = 0; i < gpu_count; i++) {
|
||||
for (uint i = 0; i < active_gpu_count; i++) {
|
||||
cuda_destroy_stream(sub_streams_1[i], gpu_indexes[i]);
|
||||
cuda_destroy_stream(sub_streams_2[i], gpu_indexes[i]);
|
||||
cuda_destroy_stream(sub_streams_3[i], gpu_indexes[i]);
|
||||
@@ -2884,6 +2952,247 @@ template <typename Torus> struct int_div_rem_memory {
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Torus> struct int_last_block_inner_propagate_memory {
|
||||
|
||||
int_radix_lut<Torus> *last_block_inner_propagation_lut;
|
||||
int_radix_params params;
|
||||
|
||||
int_last_block_inner_propagate_memory(
|
||||
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
|
||||
int_radix_params params, SIGNED_OPERATION op, uint32_t num_radix_blocks,
|
||||
bool allocate_gpu_memory) {
|
||||
|
||||
this->params = params;
|
||||
auto message_modulus = params.message_modulus;
|
||||
uint32_t bits_of_message =
|
||||
static_cast<uint32_t>(std::log2(params.message_modulus));
|
||||
Torus message_bit_mask = (1 << bits_of_message) - 1;
|
||||
|
||||
// declare lambda function for last_block_inner_propagation_lut generation
|
||||
auto f_last_block_inner_propagation_lut =
|
||||
[op, message_modulus, message_bit_mask,
|
||||
bits_of_message](Torus lhs_block, Torus rhs_block) -> Torus {
|
||||
uint64_t rhs_block_modified;
|
||||
if (op == SIGNED_OPERATION::SUBTRACTION) {
|
||||
// Subtraction is done by adding the negation
|
||||
// Negation(x) = bit_flip(x) + 1
|
||||
// Only add the flipped value, the +1 will be resolved by carry
|
||||
// propagation computation
|
||||
uint64_t flipped_rhs = ~rhs_block;
|
||||
|
||||
// Remove the last bit, it's not interesting in this step
|
||||
rhs_block_modified = (flipped_rhs << 1) & message_bit_mask;
|
||||
} else {
|
||||
rhs_block_modified = (rhs_block << 1) & message_bit_mask;
|
||||
}
|
||||
|
||||
uint64_t lhs_block_modified = (lhs_block << 1) & message_bit_mask;
|
||||
|
||||
// whole_result contains the result of addition with
|
||||
// the carry being in the first bit of carry space
|
||||
// the message space contains the message, but with one 0
|
||||
// on the right (LSB)
|
||||
uint64_t whole_result = lhs_block_modified + rhs_block_modified;
|
||||
uint64_t carry = whole_result >> bits_of_message;
|
||||
uint64_t result = (whole_result & message_bit_mask) >> 1;
|
||||
OUTPUT_CARRY propagation_result;
|
||||
if (carry == 1) {
|
||||
// Addition of bits before the last one generates a carry
|
||||
propagation_result = OUTPUT_CARRY::GENERATED;
|
||||
} else if (result == ((message_modulus - 1) >> 1)) {
|
||||
// Addition of bits before the last one puts the bits
|
||||
// in a state that makes it so that an input carry into the last block
|
||||
// gets propagated to the last bit.
|
||||
propagation_result = OUTPUT_CARRY::PROPAGATED;
|
||||
} else {
|
||||
propagation_result = OUTPUT_CARRY::NONE;
|
||||
}
|
||||
|
||||
// Shift the propagation result in the carry part
|
||||
// to have less noise growth later
|
||||
return (static_cast<uint64_t>(propagation_result) << bits_of_message);
|
||||
};
|
||||
|
||||
last_block_inner_propagation_lut = new int_radix_lut<Torus>(
|
||||
streams, gpu_indexes, gpu_count, params, 1, 1, allocate_gpu_memory);
|
||||
|
||||
generate_device_accumulator_bivariate<Torus>(
|
||||
streams[0], gpu_indexes[0],
|
||||
last_block_inner_propagation_lut->get_lut(gpu_indexes[0], 0),
|
||||
params.glwe_dimension, params.polynomial_size, message_modulus,
|
||||
params.carry_modulus, f_last_block_inner_propagation_lut);
|
||||
last_block_inner_propagation_lut->broadcast_lut(streams, gpu_indexes,
|
||||
gpu_indexes[0]);
|
||||
}
|
||||
|
||||
void release(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
uint32_t gpu_count) {
|
||||
last_block_inner_propagation_lut->release(streams, gpu_indexes, gpu_count);
|
||||
delete last_block_inner_propagation_lut;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Torus> struct int_resolve_signed_overflow_memory {
|
||||
|
||||
int_radix_lut<Torus> *resolve_overflow_lut;
|
||||
int_radix_params params;
|
||||
|
||||
Torus *x;
|
||||
|
||||
int_resolve_signed_overflow_memory(cudaStream_t *streams,
|
||||
uint32_t *gpu_indexes, uint32_t gpu_count,
|
||||
int_radix_params params,
|
||||
bool allocate_gpu_memory) {
|
||||
|
||||
this->params = params;
|
||||
|
||||
auto message_modulus = params.message_modulus;
|
||||
|
||||
uint32_t bits_of_message =
|
||||
static_cast<uint32_t>(std::log2(message_modulus));
|
||||
|
||||
x = (Torus *)cuda_malloc_async((params.big_lwe_dimension + 1) *
|
||||
sizeof(Torus),
|
||||
streams[0], gpu_indexes[0]);
|
||||
|
||||
// declare lambda function for resolve_overflow_lut generation
|
||||
auto f_resolve_overflow_lut = [bits_of_message](Torus x) -> Torus {
|
||||
Torus carry_propagation = x >> bits_of_message;
|
||||
Torus output_carry_of_block = (x >> 1) & 1;
|
||||
Torus input_carry_of_block = x & 1;
|
||||
|
||||
// Resolve the carry that the last bit actually receives as input
|
||||
Torus input_carry_to_last_bit;
|
||||
if (carry_propagation == OUTPUT_CARRY::PROPAGATED) {
|
||||
input_carry_to_last_bit = input_carry_of_block;
|
||||
} else if (carry_propagation == OUTPUT_CARRY::GENERATED) {
|
||||
input_carry_to_last_bit = 1;
|
||||
} else {
|
||||
input_carry_to_last_bit = 0;
|
||||
};
|
||||
|
||||
return input_carry_to_last_bit != output_carry_of_block;
|
||||
};
|
||||
|
||||
resolve_overflow_lut = new int_radix_lut<Torus>(
|
||||
streams, gpu_indexes, gpu_count, params, 1, 1, allocate_gpu_memory);
|
||||
|
||||
generate_device_accumulator<Torus>(
|
||||
streams[0], gpu_indexes[0],
|
||||
resolve_overflow_lut->get_lut(gpu_indexes[0], 0), params.glwe_dimension,
|
||||
params.polynomial_size, message_modulus, params.carry_modulus,
|
||||
f_resolve_overflow_lut);
|
||||
resolve_overflow_lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
|
||||
}
|
||||
|
||||
void release(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
uint32_t gpu_count) {
|
||||
resolve_overflow_lut->release(streams, gpu_indexes, gpu_count);
|
||||
delete resolve_overflow_lut;
|
||||
cuda_drop_async(x, streams[0], gpu_indexes[0]);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Torus> struct int_signed_overflowing_add_or_sub_memory {
|
||||
int_radix_params params;
|
||||
uint32_t active_gpu_count;
|
||||
|
||||
// memory objects for other operations
|
||||
int_sc_prop_memory<Torus> *scp_mem;
|
||||
int_last_block_inner_propagate_memory<Torus> *las_block_prop_mem;
|
||||
int_resolve_signed_overflow_memory<Torus> *resolve_overflow_mem;
|
||||
|
||||
// sub streams
|
||||
cudaStream_t *sub_streams_1;
|
||||
cudaStream_t *sub_streams_2;
|
||||
|
||||
// temporary device buffers
|
||||
Torus *result; // num_blocks
|
||||
Torus *input_carries; // num_blocks
|
||||
Torus *neg_rhs; // num_blocks
|
||||
Torus *output_carry; // single block
|
||||
Torus *last_block_inner_propagation; // single block
|
||||
|
||||
// allocate temporary arrays used to calculate
|
||||
// cuda integer signed overflowing add or sub
|
||||
void allocate_temporary_buffers(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
uint32_t gpu_count, uint32_t num_blocks) {
|
||||
uint32_t big_lwe_size = params.big_lwe_dimension + 1;
|
||||
|
||||
result = (Torus *)cuda_malloc_async(
|
||||
big_lwe_size * num_blocks * sizeof(Torus), streams[0], gpu_indexes[0]);
|
||||
|
||||
neg_rhs = (Torus *)cuda_malloc_async(
|
||||
big_lwe_size * num_blocks * sizeof(Torus), streams[0], gpu_indexes[0]);
|
||||
|
||||
input_carries = (Torus *)cuda_malloc_async(
|
||||
big_lwe_size * num_blocks * sizeof(Torus), streams[0], gpu_indexes[0]);
|
||||
output_carry = (Torus *)cuda_malloc_async(big_lwe_size * sizeof(Torus),
|
||||
streams[0], gpu_indexes[0]);
|
||||
last_block_inner_propagation = (Torus *)cuda_malloc_async(
|
||||
big_lwe_size * sizeof(Torus), streams[0], gpu_indexes[0]);
|
||||
}
|
||||
|
||||
// constructor without memory reuse
|
||||
int_signed_overflowing_add_or_sub_memory(
|
||||
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
|
||||
int_radix_params params, uint32_t num_blocks, SIGNED_OPERATION op,
|
||||
bool allocate_gpu_memory) {
|
||||
this->params = params;
|
||||
active_gpu_count = get_active_gpu_count(num_blocks, gpu_count);
|
||||
|
||||
allocate_temporary_buffers(streams, gpu_indexes, active_gpu_count,
|
||||
num_blocks);
|
||||
|
||||
// initialize streams
|
||||
sub_streams_1 =
|
||||
(cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
|
||||
sub_streams_2 =
|
||||
(cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
|
||||
for (uint j = 0; j < active_gpu_count; j++) {
|
||||
sub_streams_1[j] = cuda_create_stream(gpu_indexes[j]);
|
||||
sub_streams_2[j] = cuda_create_stream(gpu_indexes[j]);
|
||||
}
|
||||
|
||||
// initialize memory objects for other operations
|
||||
scp_mem =
|
||||
new int_sc_prop_memory<Torus>(streams, gpu_indexes, gpu_count, params,
|
||||
num_blocks, allocate_gpu_memory);
|
||||
las_block_prop_mem = new int_last_block_inner_propagate_memory<Torus>(
|
||||
streams, gpu_indexes, gpu_count, params, op, num_blocks,
|
||||
allocate_gpu_memory);
|
||||
|
||||
resolve_overflow_mem = new int_resolve_signed_overflow_memory<Torus>(
|
||||
streams, gpu_indexes, gpu_count, params, allocate_gpu_memory);
|
||||
}
|
||||
|
||||
void release(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
uint32_t gpu_count) {
|
||||
// memory objects for other operations
|
||||
scp_mem->release(streams, gpu_indexes, gpu_count);
|
||||
las_block_prop_mem->release(streams, gpu_indexes, gpu_count);
|
||||
resolve_overflow_mem->release(streams, gpu_indexes, gpu_count);
|
||||
|
||||
delete scp_mem;
|
||||
delete las_block_prop_mem;
|
||||
delete resolve_overflow_mem;
|
||||
|
||||
// temporary device buffers
|
||||
cuda_drop_async(result, streams[0], gpu_indexes[0]);
|
||||
cuda_drop_async(neg_rhs, streams[0], gpu_indexes[0]);
|
||||
cuda_drop_async(input_carries, streams[0], gpu_indexes[0]);
|
||||
cuda_drop_async(output_carry, streams[0], gpu_indexes[0]);
|
||||
cuda_drop_async(last_block_inner_propagation, streams[0], gpu_indexes[0]);
|
||||
|
||||
// sub streams
|
||||
for (uint i = 0; i < active_gpu_count; i++) {
|
||||
cuda_destroy_stream(sub_streams_1[i], gpu_indexes[i]);
|
||||
cuda_destroy_stream(sub_streams_2[i], gpu_indexes[i]);
|
||||
}
|
||||
free(sub_streams_1);
|
||||
free(sub_streams_2);
|
||||
}
|
||||
};
|
||||
template <typename Torus> struct int_bitop_buffer {
|
||||
|
||||
int_radix_params params;
|
||||
@@ -2951,8 +3260,6 @@ template <typename Torus> struct int_bitop_buffer {
|
||||
lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
|
||||
}
|
||||
}
|
||||
|
||||
lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
|
||||
}
|
||||
|
||||
void release(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
@@ -2981,8 +3288,8 @@ template <typename Torus> struct int_scalar_mul_buffer {
|
||||
size_t num_ciphertext_bits = msg_bits * num_radix_blocks;
|
||||
|
||||
//// Contains all shifted values of lhs for shift in range (0..msg_bits)
|
||||
//// The idea is that with these we can create all other shift that are in
|
||||
//// range (0..total_bits) for free (block rotation)
|
||||
//// The idea is that with these we can create all other shift that are
|
||||
//// in range (0..total_bits) for free (block rotation)
|
||||
preshifted_buffer = (Torus *)cuda_malloc_async(
|
||||
num_ciphertext_bits * lwe_size_bytes, streams[0], gpu_indexes[0]);
|
||||
|
||||
@@ -3010,9 +3317,8 @@ template <typename Torus> struct int_scalar_mul_buffer {
|
||||
|
||||
void release(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
uint32_t gpu_count) {
|
||||
logical_scalar_shift_buffer->release(streams, gpu_indexes, gpu_count);
|
||||
sum_ciphertexts_vec_mem->release(streams, gpu_indexes, gpu_count);
|
||||
cuda_drop_async(preshifted_buffer, streams[0], gpu_indexes[0]);
|
||||
delete sum_ciphertexts_vec_mem;
|
||||
cuda_drop_async(all_shifted_buffer, streams[0], gpu_indexes[0]);
|
||||
}
|
||||
};
|
||||
|
||||
@@ -9,15 +9,13 @@ void cuda_keyswitch_lwe_ciphertext_vector_32(
|
||||
void *stream, uint32_t gpu_index, void *lwe_array_out,
|
||||
void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
|
||||
void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t gpu_offset = 0);
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples);
|
||||
|
||||
void cuda_keyswitch_lwe_ciphertext_vector_64(
|
||||
void *stream, uint32_t gpu_index, void *lwe_array_out,
|
||||
void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
|
||||
void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t gpu_offset = 0);
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples);
|
||||
}
|
||||
|
||||
#endif // CNCRT_KS_H_
|
||||
|
||||
@@ -26,14 +26,12 @@ void cuda_convert_lwe_programmable_bootstrap_key_64(
|
||||
void scratch_cuda_programmable_bootstrap_amortized_32(
|
||||
void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory);
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
|
||||
|
||||
void scratch_cuda_programmable_bootstrap_amortized_64(
|
||||
void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory);
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
|
||||
|
||||
void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32(
|
||||
void *stream, uint32_t gpu_index, void *lwe_array_out,
|
||||
@@ -41,8 +39,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32(
|
||||
void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
|
||||
int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
|
||||
uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
|
||||
uint32_t max_shared_memory);
|
||||
uint32_t num_samples);
|
||||
|
||||
void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
|
||||
void *stream, uint32_t gpu_index, void *lwe_array_out,
|
||||
@@ -50,8 +47,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
|
||||
void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
|
||||
int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
|
||||
uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
|
||||
uint32_t max_shared_memory, uint32_t gpu_offset = 0);
|
||||
uint32_t num_samples);
|
||||
|
||||
void cleanup_cuda_programmable_bootstrap_amortized(void *stream,
|
||||
uint32_t gpu_index,
|
||||
@@ -60,14 +56,12 @@ void cleanup_cuda_programmable_bootstrap_amortized(void *stream,
|
||||
void scratch_cuda_programmable_bootstrap_32(
|
||||
void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory);
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
|
||||
|
||||
void scratch_cuda_programmable_bootstrap_64(
|
||||
void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory);
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
|
||||
|
||||
void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
|
||||
void *stream, uint32_t gpu_index, void *lwe_array_out,
|
||||
@@ -75,8 +69,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
|
||||
void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
|
||||
int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
|
||||
uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
|
||||
uint32_t max_shared_memory, uint32_t gpu_offset = 0);
|
||||
uint32_t num_samples);
|
||||
|
||||
void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
|
||||
void *stream, uint32_t gpu_index, void *lwe_array_out,
|
||||
@@ -84,44 +77,41 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
|
||||
void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
|
||||
int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
|
||||
uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
|
||||
uint32_t max_shared_memory, uint32_t gpu_offset = 0);
|
||||
uint32_t num_samples);
|
||||
|
||||
void cleanup_cuda_programmable_bootstrap(void *stream, uint32_t gpu_index,
|
||||
int8_t **pbs_buffer);
|
||||
|
||||
uint64_t get_buffer_size_programmable_bootstrap_amortized_64(
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory);
|
||||
uint32_t input_lwe_ciphertext_count);
|
||||
|
||||
uint64_t get_buffer_size_programmable_bootstrap_64(
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory);
|
||||
uint32_t input_lwe_ciphertext_count);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ __device__ uint64_t
|
||||
get_buffer_size_full_sm_programmable_bootstrap_step_one(
|
||||
uint64_t get_buffer_size_full_sm_programmable_bootstrap_step_one(
|
||||
uint32_t polynomial_size) {
|
||||
return sizeof(Torus) * polynomial_size + // accumulator_rotated
|
||||
sizeof(double2) * polynomial_size / 2; // accumulator fft
|
||||
}
|
||||
template <typename Torus>
|
||||
__host__ __device__ uint64_t
|
||||
get_buffer_size_full_sm_programmable_bootstrap_step_two(
|
||||
uint64_t get_buffer_size_full_sm_programmable_bootstrap_step_two(
|
||||
uint32_t polynomial_size) {
|
||||
return sizeof(Torus) * polynomial_size + // accumulator
|
||||
sizeof(double2) * polynomial_size / 2; // accumulator fft
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ __device__ uint64_t
|
||||
uint64_t
|
||||
get_buffer_size_partial_sm_programmable_bootstrap(uint32_t polynomial_size) {
|
||||
return sizeof(double2) * polynomial_size / 2; // accumulator fft
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ __device__ uint64_t
|
||||
uint64_t
|
||||
get_buffer_size_full_sm_programmable_bootstrap_tbc(uint32_t polynomial_size) {
|
||||
return sizeof(Torus) * polynomial_size + // accumulator_rotated
|
||||
sizeof(Torus) * polynomial_size + // accumulator
|
||||
@@ -129,21 +119,19 @@ get_buffer_size_full_sm_programmable_bootstrap_tbc(uint32_t polynomial_size) {
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ __device__ uint64_t
|
||||
get_buffer_size_partial_sm_programmable_bootstrap_tbc(
|
||||
uint64_t get_buffer_size_partial_sm_programmable_bootstrap_tbc(
|
||||
uint32_t polynomial_size) {
|
||||
return sizeof(double2) * polynomial_size / 2; // accumulator fft mask & body
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ __device__ uint64_t
|
||||
get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap(
|
||||
uint64_t get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap(
|
||||
uint32_t polynomial_size) {
|
||||
return sizeof(double2) * polynomial_size / 2; // tbc
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ __device__ uint64_t
|
||||
uint64_t
|
||||
get_buffer_size_full_sm_programmable_bootstrap_cg(uint32_t polynomial_size) {
|
||||
return sizeof(Torus) * polynomial_size + // accumulator_rotated
|
||||
sizeof(Torus) * polynomial_size + // accumulator
|
||||
@@ -151,15 +139,14 @@ get_buffer_size_full_sm_programmable_bootstrap_cg(uint32_t polynomial_size) {
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ __device__ uint64_t
|
||||
uint64_t
|
||||
get_buffer_size_partial_sm_programmable_bootstrap_cg(uint32_t polynomial_size) {
|
||||
return sizeof(double2) * polynomial_size / 2; // accumulator fft mask & body
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ bool
|
||||
supports_distributed_shared_memory_on_classic_programmable_bootstrap(
|
||||
uint32_t polynomial_size, uint32_t max_shared_memory);
|
||||
bool supports_distributed_shared_memory_on_classic_programmable_bootstrap(
|
||||
uint32_t polynomial_size);
|
||||
|
||||
template <typename Torus, PBS_TYPE pbs_type> struct pbs_buffer;
|
||||
|
||||
@@ -178,7 +165,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
|
||||
|
||||
this->pbs_variant = pbs_variant;
|
||||
|
||||
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
|
||||
auto max_shared_memory = cuda_get_max_shared_memory(0);
|
||||
|
||||
if (allocate_gpu_memory) {
|
||||
switch (pbs_variant) {
|
||||
@@ -255,7 +242,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
|
||||
|
||||
bool supports_dsm =
|
||||
supports_distributed_shared_memory_on_classic_programmable_bootstrap<
|
||||
Torus>(polynomial_size, max_shared_memory);
|
||||
Torus>(polynomial_size);
|
||||
|
||||
uint64_t full_sm =
|
||||
get_buffer_size_full_sm_programmable_bootstrap_tbc<Torus>(
|
||||
@@ -314,10 +301,10 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
|
||||
};
|
||||
|
||||
template <typename Torus>
|
||||
__host__ __device__ uint64_t get_buffer_size_programmable_bootstrap_cg(
|
||||
uint64_t get_buffer_size_programmable_bootstrap_cg(
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {
|
||||
|
||||
uint32_t input_lwe_ciphertext_count) {
|
||||
int max_shared_memory = cuda_get_max_shared_memory(0);
|
||||
uint64_t full_sm =
|
||||
get_buffer_size_full_sm_programmable_bootstrap_cg<Torus>(polynomial_size);
|
||||
uint64_t partial_sm =
|
||||
@@ -343,8 +330,7 @@ template <typename Torus>
|
||||
bool has_support_to_cuda_programmable_bootstrap_cg(uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size,
|
||||
uint32_t level_count,
|
||||
uint32_t num_samples,
|
||||
uint32_t max_shared_memory);
|
||||
uint32_t num_samples);
|
||||
|
||||
template <typename Torus>
|
||||
void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
|
||||
@@ -353,8 +339,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
|
||||
Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
|
||||
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
|
||||
uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t gpu_offset = 0);
|
||||
uint32_t level_count, uint32_t num_samples);
|
||||
|
||||
template <typename Torus>
|
||||
void cuda_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
@@ -363,8 +348,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
|
||||
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
|
||||
uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t gpu_offset = 0);
|
||||
uint32_t level_count, uint32_t num_samples);
|
||||
|
||||
#if (CUDA_ARCH >= 900)
|
||||
template <typename Torus>
|
||||
@@ -374,43 +358,44 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
|
||||
Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
|
||||
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
|
||||
uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t gpu_offset = 0);
|
||||
uint32_t level_count, uint32_t num_samples);
|
||||
|
||||
template <typename Torus, typename STorus>
|
||||
template <typename Torus>
|
||||
void scratch_cuda_programmable_bootstrap_tbc(
|
||||
void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **pbs_buffer,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory);
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
|
||||
#endif
|
||||
|
||||
template <typename Torus, typename STorus>
|
||||
template <typename Torus>
|
||||
void scratch_cuda_programmable_bootstrap_cg(
|
||||
void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **pbs_buffer,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory);
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
|
||||
|
||||
template <typename Torus, typename STorus>
|
||||
template <typename Torus>
|
||||
void scratch_cuda_programmable_bootstrap(
|
||||
void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **buffer,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory);
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
|
||||
|
||||
template <typename Torus>
|
||||
bool has_support_to_cuda_programmable_bootstrap_tbc(uint32_t num_samples,
|
||||
uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size,
|
||||
uint32_t level_count,
|
||||
uint32_t max_shared_memory);
|
||||
uint32_t level_count);
|
||||
|
||||
#ifdef __CUDACC__
|
||||
__device__ inline int get_start_ith_ggsw(int i, uint32_t polynomial_size,
|
||||
int glwe_dimension,
|
||||
uint32_t level_count);
|
||||
|
||||
template <typename T>
|
||||
__device__ const T *get_ith_mask_kth_block(const T *ptr, int i, int k,
|
||||
int level, uint32_t polynomial_size,
|
||||
int glwe_dimension,
|
||||
uint32_t level_count);
|
||||
|
||||
template <typename T>
|
||||
__device__ T *get_ith_mask_kth_block(T *ptr, int i, int k, int level,
|
||||
uint32_t polynomial_size,
|
||||
@@ -422,8 +407,8 @@ __device__ T *get_ith_body_kth_block(T *ptr, int i, int k, int level,
|
||||
int glwe_dimension, uint32_t level_count);
|
||||
|
||||
template <typename T>
|
||||
__device__ T *get_multi_bit_ith_lwe_gth_group_kth_block(
|
||||
T *ptr, int g, int i, int k, int level, uint32_t grouping_factor,
|
||||
__device__ const T *get_multi_bit_ith_lwe_gth_group_kth_block(
|
||||
const T *ptr, int g, int i, int k, int level, uint32_t grouping_factor,
|
||||
uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t level_count);
|
||||
|
||||
#endif
|
||||
|
||||
@@ -8,7 +8,7 @@ extern "C" {
|
||||
|
||||
bool has_support_to_cuda_programmable_bootstrap_cg_multi_bit(
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t num_samples, uint32_t max_shared_memory);
|
||||
uint32_t num_samples);
|
||||
|
||||
void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
|
||||
void *stream, uint32_t gpu_index, void *dest, void *src,
|
||||
@@ -19,8 +19,7 @@ void scratch_cuda_multi_bit_programmable_bootstrap_64(
|
||||
void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t grouping_factor,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory, uint32_t chunk_size = 0);
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
|
||||
|
||||
void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
|
||||
void *stream, uint32_t gpu_index, void *lwe_array_out,
|
||||
@@ -28,9 +27,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
|
||||
void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
|
||||
int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
|
||||
uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t gpu_offset,
|
||||
uint32_t lwe_chunk_size = 0);
|
||||
uint32_t level_count, uint32_t num_samples);
|
||||
|
||||
void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream,
|
||||
uint32_t gpu_index,
|
||||
@@ -38,23 +35,21 @@ void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream,
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ bool
|
||||
supports_distributed_shared_memory_on_multibit_programmable_bootstrap(
|
||||
uint32_t polynomial_size, uint32_t max_shared_memory);
|
||||
bool supports_distributed_shared_memory_on_multibit_programmable_bootstrap(
|
||||
uint32_t polynomial_size);
|
||||
|
||||
template <typename Torus>
|
||||
bool has_support_to_cuda_programmable_bootstrap_tbc_multi_bit(
|
||||
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t max_shared_memory);
|
||||
uint32_t level_count);
|
||||
|
||||
#if CUDA_ARCH >= 900
|
||||
template <typename Torus, typename STorus>
|
||||
template <typename Torus>
|
||||
void scratch_cuda_tbc_multi_bit_programmable_bootstrap(
|
||||
void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **buffer,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t grouping_factor,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory, uint32_t lwe_chunk_size);
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
|
||||
|
||||
template <typename Torus>
|
||||
void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
@@ -63,25 +58,14 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
|
||||
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
|
||||
uint32_t gpu_offset, uint32_t lwe_chunk_size);
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples);
|
||||
#endif
|
||||
|
||||
template <typename Torus, typename STorus>
|
||||
void scratch_cuda_cg_multi_bit_programmable_bootstrap(
|
||||
void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **pbs_buffer,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t grouping_factor,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0);
|
||||
|
||||
template <typename Torus, typename STorus>
|
||||
template <typename Torus>
|
||||
void scratch_cuda_cg_multi_bit_programmable_bootstrap(
|
||||
void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **pbs_buffer,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0);
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
|
||||
|
||||
template <typename Torus>
|
||||
void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
@@ -90,17 +74,14 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
|
||||
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
|
||||
uint32_t gpu_offset, uint32_t lwe_chunk_size = 0);
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples);
|
||||
|
||||
template <typename Torus, typename STorus>
|
||||
template <typename Torus>
|
||||
void scratch_cuda_multi_bit_programmable_bootstrap(
|
||||
void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **pbs_buffer,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t grouping_factor,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0);
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
|
||||
|
||||
template <typename Torus>
|
||||
void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
@@ -109,45 +90,34 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
|
||||
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
|
||||
uint32_t gpu_offset, uint32_t lwe_chunk_size = 0);
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples);
|
||||
|
||||
template <typename Torus>
|
||||
__host__ __device__ uint64_t
|
||||
get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle(
|
||||
uint64_t get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle(
|
||||
uint32_t polynomial_size);
|
||||
template <typename Torus>
|
||||
__host__ __device__ uint64_t
|
||||
get_buffer_size_full_sm_multibit_programmable_bootstrap_step_one(
|
||||
uint64_t get_buffer_size_full_sm_multibit_programmable_bootstrap_step_one(
|
||||
uint32_t polynomial_size);
|
||||
template <typename Torus>
|
||||
__host__ __device__ uint64_t
|
||||
get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two(
|
||||
uint64_t get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two(
|
||||
uint32_t polynomial_size);
|
||||
template <typename Torus>
|
||||
__host__ __device__ uint64_t
|
||||
get_buffer_size_partial_sm_multibit_programmable_bootstrap_step_one(
|
||||
uint64_t get_buffer_size_partial_sm_multibit_programmable_bootstrap_step_one(
|
||||
uint32_t polynomial_size);
|
||||
template <typename Torus>
|
||||
__host__ __device__ uint64_t
|
||||
get_buffer_size_full_sm_cg_multibit_programmable_bootstrap(
|
||||
uint64_t get_buffer_size_full_sm_cg_multibit_programmable_bootstrap(
|
||||
uint32_t polynomial_size);
|
||||
template <typename Torus>
|
||||
__host__ __device__ uint64_t
|
||||
get_buffer_size_partial_sm_cg_multibit_programmable_bootstrap(
|
||||
uint64_t get_buffer_size_partial_sm_cg_multibit_programmable_bootstrap(
|
||||
uint32_t polynomial_size);
|
||||
template <typename Torus>
|
||||
__host__ __device__ uint64_t
|
||||
get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap(
|
||||
uint64_t get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap(
|
||||
uint32_t polynomial_size);
|
||||
template <typename Torus>
|
||||
__host__ __device__ uint64_t
|
||||
get_buffer_size_partial_sm_tbc_multibit_programmable_bootstrap(
|
||||
uint64_t get_buffer_size_partial_sm_tbc_multibit_programmable_bootstrap(
|
||||
uint32_t polynomial_size);
|
||||
template <typename Torus>
|
||||
__host__ __device__ uint64_t
|
||||
get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap(
|
||||
uint64_t get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap(
|
||||
uint32_t polynomial_size);
|
||||
|
||||
template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
|
||||
@@ -317,8 +287,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
|
||||
};
|
||||
|
||||
template <typename Torus, class params>
|
||||
__host__ uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
|
||||
uint32_t polynomial_size,
|
||||
uint32_t max_shared_memory);
|
||||
uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
|
||||
uint32_t polynomial_size);
|
||||
|
||||
#endif // CUDA_MULTI_BIT_H
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
#include "ciphertext.cuh"
|
||||
#include "polynomial/parameters.cuh"
|
||||
|
||||
void cuda_convert_lwe_ciphertext_vector_to_gpu_64(void *stream,
|
||||
uint32_t gpu_index,
|
||||
@@ -19,3 +20,58 @@ void cuda_convert_lwe_ciphertext_vector_to_cpu_64(void *stream,
|
||||
static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)dest,
|
||||
(uint64_t *)src, number_of_cts, lwe_dimension);
|
||||
}
|
||||
|
||||
void cuda_glwe_sample_extract_64(void *stream, uint32_t gpu_index,
|
||||
void *lwe_array_out, void *glwe_array_in,
|
||||
uint32_t *nth_array, uint32_t num_glwes,
|
||||
uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size) {
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
host_sample_extract<uint64_t, AmortizedDegree<256>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
|
||||
(uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_glwes,
|
||||
glwe_dimension);
|
||||
break;
|
||||
case 512:
|
||||
host_sample_extract<uint64_t, AmortizedDegree<512>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
|
||||
(uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_glwes,
|
||||
glwe_dimension);
|
||||
break;
|
||||
case 1024:
|
||||
host_sample_extract<uint64_t, AmortizedDegree<1024>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
|
||||
(uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_glwes,
|
||||
glwe_dimension);
|
||||
break;
|
||||
case 2048:
|
||||
host_sample_extract<uint64_t, AmortizedDegree<2048>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
|
||||
(uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_glwes,
|
||||
glwe_dimension);
|
||||
break;
|
||||
case 4096:
|
||||
host_sample_extract<uint64_t, AmortizedDegree<4096>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
|
||||
(uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_glwes,
|
||||
glwe_dimension);
|
||||
break;
|
||||
case 8192:
|
||||
host_sample_extract<uint64_t, AmortizedDegree<8192>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
|
||||
(uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_glwes,
|
||||
glwe_dimension);
|
||||
break;
|
||||
case 16384:
|
||||
host_sample_extract<uint64_t, AmortizedDegree<16384>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
|
||||
(uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_glwes,
|
||||
glwe_dimension);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error: unsupported polynomial size. Supported "
|
||||
"N's are powers of two in the interval [256..16384].")
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
|
||||
#include "ciphertext.h"
|
||||
#include "device.h"
|
||||
#include "polynomial/functions.cuh"
|
||||
#include <cstdint>
|
||||
|
||||
template <typename T>
|
||||
@@ -25,4 +26,39 @@ void cuda_convert_lwe_ciphertext_vector_to_cpu(cudaStream_t stream,
|
||||
cuda_memcpy_async_to_cpu(dest, src, size, stream, gpu_index);
|
||||
}
|
||||
|
||||
template <typename Torus, class params>
|
||||
__global__ void sample_extract(Torus *lwe_array_out, Torus *glwe_array_in,
|
||||
uint32_t *nth_array, uint32_t glwe_dimension) {
|
||||
|
||||
const int input_id = blockIdx.x;
|
||||
|
||||
const int glwe_input_size = (glwe_dimension + 1) * params::degree;
|
||||
const int lwe_output_size = glwe_dimension * params::degree + 1;
|
||||
|
||||
auto lwe_out = lwe_array_out + input_id * lwe_output_size;
|
||||
|
||||
// We assume each GLWE will store the first polynomial_size inputs
|
||||
uint32_t nth_per_glwe = params::degree;
|
||||
auto glwe_in = glwe_array_in + (input_id / nth_per_glwe) * glwe_input_size;
|
||||
|
||||
auto nth = nth_array[input_id];
|
||||
|
||||
sample_extract_mask<Torus, params>(lwe_out, glwe_in, glwe_dimension, nth);
|
||||
sample_extract_body<Torus, params>(lwe_out, glwe_in, glwe_dimension, nth);
|
||||
}
|
||||
|
||||
template <typename Torus, class params>
|
||||
__host__ void host_sample_extract(cudaStream_t stream, uint32_t gpu_index,
|
||||
Torus *lwe_array_out, Torus *glwe_array_in,
|
||||
uint32_t *nth_array, uint32_t num_glwes,
|
||||
uint32_t glwe_dimension) {
|
||||
cudaSetDevice(gpu_index);
|
||||
|
||||
dim3 grid(num_glwes);
|
||||
dim3 thds(params::degree / params::opt);
|
||||
sample_extract<Torus, params><<<grid, thds, 0, stream>>>(
|
||||
lwe_array_out, glwe_array_in, nth_array, glwe_dimension);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
@@ -9,16 +9,14 @@ void cuda_keyswitch_lwe_ciphertext_vector_32(
|
||||
void *stream, uint32_t gpu_index, void *lwe_array_out,
|
||||
void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
|
||||
void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t gpu_offset) {
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
|
||||
cuda_keyswitch_lwe_ciphertext_vector(
|
||||
static_cast<cudaStream_t>(stream), gpu_index,
|
||||
static_cast<uint32_t *>(lwe_array_out),
|
||||
static_cast<uint32_t *>(lwe_output_indexes),
|
||||
static_cast<uint32_t *>(lwe_array_in),
|
||||
static_cast<uint32_t *>(lwe_input_indexes), static_cast<uint32_t *>(ksk),
|
||||
lwe_dimension_in, lwe_dimension_out, base_log, level_count, num_samples,
|
||||
gpu_offset);
|
||||
lwe_dimension_in, lwe_dimension_out, base_log, level_count, num_samples);
|
||||
}
|
||||
|
||||
/* Perform keyswitch on a batch of 64 bits input LWE ciphertexts.
|
||||
@@ -41,14 +39,12 @@ void cuda_keyswitch_lwe_ciphertext_vector_64(
|
||||
void *stream, uint32_t gpu_index, void *lwe_array_out,
|
||||
void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
|
||||
void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t gpu_offset) {
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
|
||||
cuda_keyswitch_lwe_ciphertext_vector(
|
||||
static_cast<cudaStream_t>(stream), gpu_index,
|
||||
static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<uint64_t *>(lwe_output_indexes),
|
||||
static_cast<uint64_t *>(lwe_array_in),
|
||||
static_cast<uint64_t *>(lwe_input_indexes), static_cast<uint64_t *>(ksk),
|
||||
lwe_dimension_in, lwe_dimension_out, base_log, level_count, num_samples,
|
||||
gpu_offset);
|
||||
lwe_dimension_in, lwe_dimension_out, base_log, level_count, num_samples);
|
||||
}
|
||||
|
||||
@@ -38,26 +38,25 @@ __device__ Torus *get_ith_block(Torus *ksk, int i, int level,
|
||||
// threads in y are used to paralelize the lwe_dimension_in loop.
|
||||
// shared memory is used to store intermediate results of the reduction.
|
||||
template <typename Torus>
|
||||
__global__ void keyswitch(Torus *lwe_array_out, Torus *lwe_output_indexes,
|
||||
Torus *lwe_array_in, Torus *lwe_input_indexes,
|
||||
Torus *ksk, uint32_t lwe_dimension_in,
|
||||
uint32_t lwe_dimension_out, uint32_t base_log,
|
||||
uint32_t level_count, int gpu_offset) {
|
||||
__global__ void
|
||||
keyswitch(Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
|
||||
const Torus *__restrict__ lwe_array_in,
|
||||
const Torus *__restrict__ lwe_input_indexes,
|
||||
const Torus *__restrict__ ksk, uint32_t lwe_dimension_in,
|
||||
uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count) {
|
||||
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
const int shmem_index = threadIdx.x + threadIdx.y * blockDim.x;
|
||||
|
||||
extern __shared__ int8_t sharedmem[];
|
||||
Torus *lwe_acc_out = (Torus *)sharedmem;
|
||||
auto block_lwe_array_out =
|
||||
get_chunk(lwe_array_out, lwe_output_indexes[blockIdx.y + gpu_offset],
|
||||
lwe_dimension_out + 1);
|
||||
auto block_lwe_array_out = get_chunk(
|
||||
lwe_array_out, lwe_output_indexes[blockIdx.y], lwe_dimension_out + 1);
|
||||
|
||||
if (tid <= lwe_dimension_out) {
|
||||
|
||||
Torus local_lwe_out = 0;
|
||||
auto block_lwe_array_in =
|
||||
get_chunk(lwe_array_in, lwe_input_indexes[blockIdx.y + gpu_offset],
|
||||
lwe_dimension_in + 1);
|
||||
auto block_lwe_array_in = get_chunk(
|
||||
lwe_array_in, lwe_input_indexes[blockIdx.y], lwe_dimension_in + 1);
|
||||
|
||||
if (tid == lwe_dimension_out && threadIdx.y == 0) {
|
||||
local_lwe_out = block_lwe_array_in[lwe_dimension_in];
|
||||
@@ -103,8 +102,7 @@ __host__ void cuda_keyswitch_lwe_ciphertext_vector(
|
||||
cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
|
||||
Torus *lwe_output_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes,
|
||||
Torus *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t gpu_offset = 0) {
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
|
||||
|
||||
cudaSetDevice(gpu_index);
|
||||
|
||||
@@ -120,42 +118,40 @@ __host__ void cuda_keyswitch_lwe_ciphertext_vector(
|
||||
|
||||
keyswitch<Torus><<<grid, threads, shared_mem, stream>>>(
|
||||
lwe_array_out, lwe_output_indexes, lwe_array_in, lwe_input_indexes, ksk,
|
||||
lwe_dimension_in, lwe_dimension_out, base_log, level_count, gpu_offset);
|
||||
lwe_dimension_in, lwe_dimension_out, base_log, level_count);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
void execute_keyswitch(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
uint32_t gpu_count, Torus *lwe_array_out,
|
||||
Torus *lwe_output_indexes, Torus *lwe_array_in,
|
||||
Torus *lwe_input_indexes, Torus **ksks,
|
||||
uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
|
||||
uint32_t base_log, uint32_t level_count,
|
||||
uint32_t num_samples, bool sync_streams = true) {
|
||||
void execute_keyswitch_async(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
uint32_t gpu_count,
|
||||
const LweArrayVariant<Torus> &lwe_array_out,
|
||||
const LweArrayVariant<Torus> &lwe_output_indexes,
|
||||
const LweArrayVariant<Torus> &lwe_array_in,
|
||||
const LweArrayVariant<Torus> &lwe_input_indexes,
|
||||
Torus **ksks, uint32_t lwe_dimension_in,
|
||||
uint32_t lwe_dimension_out, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t num_samples) {
|
||||
|
||||
/// If the number of radix blocks is lower than the number of GPUs, not all
|
||||
/// GPUs will be active and there will be 1 input per GPU
|
||||
auto active_gpu_count = get_active_gpu_count(num_samples, gpu_count);
|
||||
int num_samples_on_gpu_0 = get_num_inputs_on_gpu(num_samples, 0, gpu_count);
|
||||
if (sync_streams)
|
||||
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
|
||||
#pragma omp parallel for num_threads(active_gpu_count)
|
||||
for (uint i = 0; i < active_gpu_count; i++) {
|
||||
for (uint i = 0; i < gpu_count; i++) {
|
||||
int num_samples_on_gpu = get_num_inputs_on_gpu(num_samples, i, gpu_count);
|
||||
int gpu_offset = get_gpu_offset(num_samples, i, gpu_count);
|
||||
|
||||
Torus *current_lwe_array_out = GET_VARIANT_ELEMENT(lwe_array_out, i);
|
||||
Torus *current_lwe_output_indexes =
|
||||
GET_VARIANT_ELEMENT(lwe_output_indexes, i);
|
||||
Torus *current_lwe_array_in = GET_VARIANT_ELEMENT(lwe_array_in, i);
|
||||
Torus *current_lwe_input_indexes =
|
||||
GET_VARIANT_ELEMENT(lwe_input_indexes, i);
|
||||
|
||||
// Compute Keyswitch
|
||||
cuda_keyswitch_lwe_ciphertext_vector<Torus>(
|
||||
streams[i], gpu_indexes[i], lwe_array_out, lwe_output_indexes,
|
||||
lwe_array_in, lwe_input_indexes, ksks[i], lwe_dimension_in,
|
||||
lwe_dimension_out, base_log, level_count, num_samples_on_gpu,
|
||||
gpu_offset);
|
||||
streams[i], gpu_indexes[i], current_lwe_array_out,
|
||||
current_lwe_output_indexes, current_lwe_array_in,
|
||||
current_lwe_input_indexes, ksks[i], lwe_dimension_in, lwe_dimension_out,
|
||||
base_log, level_count, num_samples_on_gpu);
|
||||
}
|
||||
|
||||
if (sync_streams)
|
||||
for (uint i = 0; i < active_gpu_count; i++) {
|
||||
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
@@ -39,36 +39,19 @@ __device__ inline T round_to_closest_multiple(T x, uint32_t base_log,
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ __forceinline__ void rescale_torus_element(T element, T &output,
|
||||
uint32_t log_shift) {
|
||||
output =
|
||||
round((double)element / (double(std::numeric_limits<T>::max()) + 1.0) *
|
||||
(double)log_shift);
|
||||
__device__ __forceinline__ void modulus_switch(T input, T &output,
|
||||
uint32_t log_modulus) {
|
||||
constexpr uint32_t BITS = sizeof(T) * 8;
|
||||
|
||||
output = input + (((T)1) << (BITS - log_modulus - 1));
|
||||
output >>= (BITS - log_modulus);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ __forceinline__ T rescale_torus_element(T element,
|
||||
uint32_t log_shift) {
|
||||
return round((double)element / (double(std::numeric_limits<T>::max()) + 1.0) *
|
||||
(double)log_shift);
|
||||
__device__ __forceinline__ T modulus_switch(T input, uint32_t log_modulus) {
|
||||
T output;
|
||||
modulus_switch(input, output, log_modulus);
|
||||
return output;
|
||||
}
|
||||
|
||||
template <>
|
||||
__device__ __forceinline__ void
|
||||
rescale_torus_element<uint32_t>(uint32_t element, uint32_t &output,
|
||||
uint32_t log_shift) {
|
||||
output =
|
||||
round(__uint2double_rn(element) /
|
||||
(__uint2double_rn(std::numeric_limits<uint32_t>::max()) + 1.0) *
|
||||
__uint2double_rn(log_shift));
|
||||
}
|
||||
|
||||
template <>
|
||||
__device__ __forceinline__ void
|
||||
rescale_torus_element<uint64_t>(uint64_t element, uint64_t &output,
|
||||
uint32_t log_shift) {
|
||||
output = round(__ull2double_rn(element) /
|
||||
(__ull2double_rn(std::numeric_limits<uint64_t>::max()) + 1.0) *
|
||||
__uint2double_rn(log_shift));
|
||||
}
|
||||
#endif // CNCRT_TORUS_H
|
||||
|
||||
@@ -166,19 +166,21 @@ __global__ void cuda_set_value_kernel(Torus *array, Torus value, Torus n) {
|
||||
template <typename Torus>
|
||||
void cuda_set_value_async(cudaStream_t stream, uint32_t gpu_index,
|
||||
Torus *d_array, Torus value, Torus n) {
|
||||
cudaPointerAttributes attr;
|
||||
check_cuda_error(cudaPointerGetAttributes(&attr, d_array));
|
||||
if (attr.type != cudaMemoryTypeDevice) {
|
||||
PANIC("Cuda error: invalid dest device pointer in cuda set value.")
|
||||
}
|
||||
check_cuda_error(cudaSetDevice(gpu_index));
|
||||
int block_size = 256;
|
||||
int num_blocks = (n + block_size - 1) / block_size;
|
||||
if (n > 0) {
|
||||
cudaPointerAttributes attr;
|
||||
check_cuda_error(cudaPointerGetAttributes(&attr, d_array));
|
||||
if (attr.type != cudaMemoryTypeDevice) {
|
||||
PANIC("Cuda error: invalid dest device pointer in cuda set value.")
|
||||
}
|
||||
check_cuda_error(cudaSetDevice(gpu_index));
|
||||
int block_size = 256;
|
||||
int num_blocks = (n + block_size - 1) / block_size;
|
||||
|
||||
// Launch the kernel
|
||||
cuda_set_value_kernel<<<num_blocks, block_size, 0, stream>>>(d_array, value,
|
||||
n);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
// Launch the kernel
|
||||
cuda_set_value_kernel<<<num_blocks, block_size, 0, stream>>>(d_array, value,
|
||||
n);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
}
|
||||
|
||||
/// Explicitly instantiate cuda_set_value_async for 32 and 64 bits
|
||||
@@ -241,22 +243,18 @@ void cuda_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index) {
|
||||
|
||||
/// Get the maximum size for the shared memory
|
||||
int cuda_get_max_shared_memory(uint32_t gpu_index) {
|
||||
check_cuda_error(cudaSetDevice(gpu_index));
|
||||
int max_shared_memory = 0;
|
||||
cudaDeviceGetAttribute(&max_shared_memory, cudaDevAttrMaxSharedMemoryPerBlock,
|
||||
gpu_index);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
#if CUDA_ARCH == 900
|
||||
max_shared_memory = 226000;
|
||||
#elif CUDA_ARCH == 890
|
||||
max_shared_memory = 127000;
|
||||
#elif CUDA_ARCH == 800
|
||||
max_shared_memory = 163000;
|
||||
#elif CUDA_ARCH == 700
|
||||
max_shared_memory = 95000;
|
||||
#endif
|
||||
return max_shared_memory;
|
||||
}
|
||||
|
||||
void cuda_stream_add_callback(cudaStream_t stream, uint32_t gpu_index,
|
||||
cudaStreamCallback_t callback, void *user_data) {
|
||||
|
||||
check_cuda_error(cudaSetDevice(gpu_index));
|
||||
check_cuda_error(cudaStreamAddCallback(stream, callback, user_data, 0));
|
||||
}
|
||||
|
||||
void host_free_on_stream_callback(cudaStream_t stream, cudaError_t status,
|
||||
void *host_pointer) {
|
||||
free(host_pointer);
|
||||
}
|
||||
|
||||
49
backends/tfhe-cuda-backend/cuda/src/integer/addition.cu
Normal file
49
backends/tfhe-cuda-backend/cuda/src/integer/addition.cu
Normal file
@@ -0,0 +1,49 @@
|
||||
#include "integer/addition.cuh"
|
||||
|
||||
void scratch_cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64(
|
||||
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
|
||||
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
uint32_t grouping_factor, uint32_t num_blocks, int8_t signed_operation,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
bool allocate_gpu_memory) {
|
||||
|
||||
SIGNED_OPERATION op = (signed_operation == 1) ? SIGNED_OPERATION::ADDITION
|
||||
: SIGNED_OPERATION::SUBTRACTION;
|
||||
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
|
||||
big_lwe_dimension, small_lwe_dimension, ks_level,
|
||||
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
|
||||
message_modulus, carry_modulus);
|
||||
|
||||
scratch_cuda_integer_signed_overflowing_add_or_sub_kb<uint64_t>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
(int_signed_overflowing_add_or_sub_memory<uint64_t> **)mem_ptr,
|
||||
num_blocks, op, params, allocate_gpu_memory);
|
||||
}
|
||||
|
||||
void cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64(
|
||||
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lhs,
|
||||
void *rhs, void *overflowed, int8_t signed_operation, int8_t *mem_ptr,
|
||||
void **bsks, void **ksks, uint32_t num_blocks) {
|
||||
|
||||
auto mem = (int_signed_overflowing_add_or_sub_memory<uint64_t> *)mem_ptr;
|
||||
SIGNED_OPERATION op = (signed_operation == 1) ? SIGNED_OPERATION::ADDITION
|
||||
: SIGNED_OPERATION::SUBTRACTION;
|
||||
|
||||
host_integer_signed_overflowing_add_or_sub_kb<uint64_t>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
static_cast<uint64_t *>(lhs), static_cast<uint64_t *>(rhs),
|
||||
static_cast<uint64_t *>(overflowed), op, bsks, (uint64_t **)(ksks), mem,
|
||||
num_blocks);
|
||||
}
|
||||
|
||||
void cleanup_signed_overflowing_add_or_sub(void **streams,
|
||||
uint32_t *gpu_indexes,
|
||||
uint32_t gpu_count,
|
||||
int8_t **mem_ptr_void) {
|
||||
int_signed_overflowing_add_or_sub_memory<uint64_t> *mem_ptr =
|
||||
(int_signed_overflowing_add_or_sub_memory<uint64_t> *)(*mem_ptr_void);
|
||||
|
||||
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
|
||||
}
|
||||
137
backends/tfhe-cuda-backend/cuda/src/integer/addition.cuh
Normal file
137
backends/tfhe-cuda-backend/cuda/src/integer/addition.cuh
Normal file
@@ -0,0 +1,137 @@
|
||||
#ifndef TFHE_RS_ADDITION_CUH
|
||||
#define TFHE_RS_ADDITION_CUH
|
||||
|
||||
#include "crypto/keyswitch.cuh"
|
||||
#include "device.h"
|
||||
#include "integer.h"
|
||||
#include "integer/comparison.cuh"
|
||||
#include "integer/integer.cuh"
|
||||
#include "integer/negation.cuh"
|
||||
#include "integer/scalar_shifts.cuh"
|
||||
#include "linear_algebra.h"
|
||||
#include "programmable_bootstrap.h"
|
||||
#include "utils/helper.cuh"
|
||||
#include "utils/kernel_dimensions.cuh"
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
// Resolves the overflow flag of a signed operation from the data of the last
// radix block and writes it to `result`.
//
// Sequence of operations (steps 1-3 and the temporary buffer management are
// enqueued on streams[0] / gpu_indexes[0]):
//   1. mem->x <- last_block_output_carry combined with a one-element
//      cleartext buffer through
//      cuda_mult_lwe_ciphertext_vector_cleartext_vector_64
//      (the buffer is initialised via cuda_set_value_async(..., 2, 1);
//      presumably a multiplication by the constant 2 - TODO confirm the
//      argument order of cuda_set_value_async)
//   2. last_block_inner_propagation += mem->x
//   3. last_block_inner_propagation += last_block_input_carry
//   4. result <- mem->resolve_overflow_lut applied to
//      last_block_inner_propagation (called with all streams / GPUs)
//
// Note that `last_block_inner_propagation` is overwritten by steps 2 and 3.
template <typename Torus>
void host_resolve_signed_overflow(
    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    Torus *result, Torus *last_block_inner_propagation,
    Torus *last_block_input_carry, Torus *last_block_output_carry,
    int_resolve_signed_overflow_memory<Torus> *mem, void **bsks, Torus **ksks) {

  auto scaled_carry = mem->x;

  // Temporary one-element cleartext buffer, dropped at the end of the function
  Torus *d_scalar =
      (Torus *)cuda_malloc_async(sizeof(Torus), streams[0], gpu_indexes[0]);
  cuda_set_value_async<Torus>(streams[0], gpu_indexes[0], d_scalar, 2, 1);

  // replace with host function call
  cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
      streams[0], gpu_indexes[0], scaled_carry, last_block_output_carry,
      d_scalar, mem->params.big_lwe_dimension, 1);

  // Accumulate the scaled output carry, then the input carry, in place
  host_addition(streams[0], gpu_indexes[0], last_block_inner_propagation,
                last_block_inner_propagation, scaled_carry,
                mem->params.big_lwe_dimension, 1);
  host_addition(streams[0], gpu_indexes[0], last_block_inner_propagation,
                last_block_inner_propagation, last_block_input_carry,
                mem->params.big_lwe_dimension, 1);

  // Single-block LUT evaluation producing the final flag
  host_apply_univariate_lut_kb<Torus>(streams, gpu_indexes, gpu_count, result,
                                      last_block_inner_propagation,
                                      mem->resolve_overflow_lut, ksks, bsks, 1);

  cuda_drop_async(d_scalar, streams[0], gpu_indexes[0]);
}
|
||||
|
||||
// Allocates the scratch object used by
// host_integer_signed_overflowing_add_or_sub_kb and stores its address in
// `*mem_ptr`.
//
// All arguments are forwarded to the constructor of
// int_signed_overflowing_add_or_sub_memory<Torus>. The object is created with
// `new`; this function does not free it.
template <typename Torus>
__host__ void scratch_cuda_integer_signed_overflowing_add_or_sub_kb(
    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    int_signed_overflowing_add_or_sub_memory<Torus> **mem_ptr,
    uint32_t num_blocks, SIGNED_OPERATION op, int_radix_params params,
    bool allocate_gpu_memory) {

  using memory_t = int_signed_overflowing_add_or_sub_memory<Torus>;

  memory_t *scratch = new memory_t(streams, gpu_indexes, gpu_count, params,
                                   num_blocks, op, allocate_gpu_memory);
  *mem_ptr = scratch;
}
|
||||
|
||||
/*
 * Signed addition / subtraction of two radix ciphertexts with overflow
 * detection.
 *
 * Addition - signed_operation = 1
 * Subtraction - signed_operation = -1
 *
 * When `op` is SIGNED_OPERATION::ADDITION, `rhs` is added to `lhs`; otherwise
 * `rhs` is first negated into the scratch buffer `neg_rhs` and that negation
 * is added. The final radix result is copied back into `lhs` and the
 * overflow flag is written to `overflowed`.
 *
 * The work is organised in three phases:
 *   1. block-wise addition into the scratch `result` buffer (streams[0])
 *   2. carry propagation (sub_streams_1) and generation of the last block
 *      inner propagation (sub_streams_2), issued on two separate stream sets
 *      after the input streams have been synchronized
 *   3. overflow resolution from the data of the last block
 *
 * Requires message_modulus >= 4 and carry_modulus >= 4 (asserted).
 */
template <typename Torus>
__host__ void host_integer_signed_overflowing_add_or_sub_kb(
    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    Torus *lhs, Torus *rhs, Torus *overflowed, SIGNED_OPERATION op, void **bsks,
    uint64_t **ksks,
    int_signed_overflowing_add_or_sub_memory<uint64_t> *mem_ptr,
    uint32_t num_blocks) {

  auto radix_params = mem_ptr->params;

  // Sizes of one LWE block and of the whole radix, kept in 32 bits like the
  // rest of the arithmetic in this function
  uint32_t lwe_dimension = radix_params.big_lwe_dimension;
  uint32_t lwe_size = lwe_dimension + 1;
  uint32_t lwe_size_bytes = lwe_size * sizeof(Torus);
  uint32_t radix_size_bytes = num_blocks * lwe_size_bytes;
  // Offset (in Torus elements) of the most significant block
  uint32_t last_block_offset = (num_blocks - 1) * lwe_size;

  assert(radix_params.message_modulus >= 4 && radix_params.carry_modulus >= 4);

  // Scratch buffers owned by mem_ptr
  auto sum = mem_ptr->result;
  auto negated_rhs = mem_ptr->neg_rhs;
  auto carries_in = mem_ptr->input_carries;
  auto carry_out = mem_ptr->output_carry;
  auto inner_propagation = mem_ptr->last_block_inner_propagation;

  cuda_memcpy_async_gpu_to_gpu(sum, lhs, radix_size_bytes, streams[0],
                               gpu_indexes[0]);

  // phase 1
  // For a subtraction the right-hand side is negated first, then both
  // operations boil down to the same block-wise addition
  auto addend = rhs;
  if (op != SIGNED_OPERATION::ADDITION) {
    host_integer_radix_negation(
        streams, gpu_indexes, gpu_count, negated_rhs, rhs, lwe_dimension,
        num_blocks, radix_params.message_modulus, radix_params.carry_modulus);
    addend = negated_rhs;
  }
  host_addition(streams[0], gpu_indexes[0], sum, lhs, addend, lwe_dimension,
                num_blocks);

  // phase 2
  // Wait for phase 1 before issuing work on the sub-streams
  for (uint gpu = 0; gpu < gpu_count; gpu++) {
    cuda_synchronize_stream(streams[gpu], gpu_indexes[gpu]);
  }

  host_propagate_single_carry(mem_ptr->sub_streams_1, gpu_indexes, gpu_count,
                              sum, carry_out, carries_in, mem_ptr->scp_mem,
                              bsks, ksks, num_blocks);
  host_generate_last_block_inner_propagation(
      mem_ptr->sub_streams_2, gpu_indexes, gpu_count, inner_propagation,
      &lhs[last_block_offset], &rhs[last_block_offset],
      mem_ptr->las_block_prop_mem, bsks, ksks);

  // Join both sub-stream sets before resolving the overflow
  for (uint gpu = 0; gpu < mem_ptr->active_gpu_count; gpu++) {
    cuda_synchronize_stream(mem_ptr->sub_streams_1[gpu], gpu_indexes[gpu]);
    cuda_synchronize_stream(mem_ptr->sub_streams_2[gpu], gpu_indexes[gpu]);
  }

  // phase 3
  auto last_block_carry_in = &carries_in[last_block_offset];

  host_resolve_signed_overflow(streams, gpu_indexes, gpu_count, overflowed,
                               inner_propagation, last_block_carry_in,
                               carry_out, mem_ptr->resolve_overflow_mem, bsks,
                               ksks);

  // The operation is in place from the caller's point of view
  cuda_memcpy_async_gpu_to_gpu(lhs, sum, radix_size_bytes, streams[0],
                               gpu_indexes[0]);
}
|
||||
|
||||
#endif // TFHE_RS_ADDITION_CUH
|
||||
@@ -2,7 +2,6 @@
|
||||
#define CUDA_INTEGER_CMUX_CUH
|
||||
|
||||
#include "integer.cuh"
|
||||
#include <omp.h>
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void zero_out_if(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
@@ -57,27 +56,18 @@ __host__ void host_integer_radix_cmux_kb(
|
||||
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
|
||||
}
|
||||
|
||||
#pragma omp parallel sections
|
||||
{
|
||||
// Both sections may be executed in parallel
|
||||
#pragma omp section
|
||||
{
|
||||
auto mem_true = mem_ptr->zero_if_true_buffer;
|
||||
zero_out_if(true_streams, gpu_indexes, gpu_count, mem_ptr->tmp_true_ct,
|
||||
lwe_array_true, lwe_condition, mem_true,
|
||||
mem_ptr->inverted_predicate_lut, bsks, ksks,
|
||||
num_radix_blocks);
|
||||
}
|
||||
#pragma omp section
|
||||
{
|
||||
auto mem_false = mem_ptr->zero_if_false_buffer;
|
||||
zero_out_if(false_streams, gpu_indexes, gpu_count, mem_ptr->tmp_false_ct,
|
||||
lwe_array_false, lwe_condition, mem_false,
|
||||
mem_ptr->predicate_lut, bsks, ksks, num_radix_blocks);
|
||||
}
|
||||
}
|
||||
for (uint j = 0; j < gpu_count; j++) {
|
||||
auto mem_true = mem_ptr->zero_if_true_buffer;
|
||||
zero_out_if(true_streams, gpu_indexes, gpu_count, mem_ptr->tmp_true_ct,
|
||||
lwe_array_true, lwe_condition, mem_true,
|
||||
mem_ptr->inverted_predicate_lut, bsks, ksks, num_radix_blocks);
|
||||
auto mem_false = mem_ptr->zero_if_false_buffer;
|
||||
zero_out_if(false_streams, gpu_indexes, gpu_count, mem_ptr->tmp_false_ct,
|
||||
lwe_array_false, lwe_condition, mem_false, mem_ptr->predicate_lut,
|
||||
bsks, ksks, num_radix_blocks);
|
||||
for (uint j = 0; j < mem_ptr->zero_if_true_buffer->active_gpu_count; j++) {
|
||||
cuda_synchronize_stream(true_streams[j], gpu_indexes[j]);
|
||||
}
|
||||
for (uint j = 0; j < mem_ptr->zero_if_false_buffer->active_gpu_count; j++) {
|
||||
cuda_synchronize_stream(false_streams[j], gpu_indexes[j]);
|
||||
}
|
||||
|
||||
|
||||
@@ -245,7 +245,6 @@ __host__ void host_compare_with_zero_equality(
|
||||
int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
|
||||
int32_t num_radix_blocks, int_radix_lut<Torus> *zero_comparison) {
|
||||
|
||||
cudaSetDevice(gpu_indexes[0]);
|
||||
auto params = mem_ptr->params;
|
||||
auto big_lwe_dimension = params.big_lwe_dimension;
|
||||
auto message_modulus = params.message_modulus;
|
||||
|
||||
@@ -26,54 +26,11 @@ void cuda_integer_div_rem_radix_ciphertext_kb_64(
|
||||
|
||||
auto mem = (int_div_rem_memory<uint64_t> *)mem_ptr;
|
||||
|
||||
switch (mem->params.polynomial_size) {
|
||||
case 512:
|
||||
host_integer_div_rem_kb<uint64_t, Degree<512>>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
static_cast<uint64_t *>(quotient), static_cast<uint64_t *>(remainder),
|
||||
static_cast<uint64_t *>(numerator), static_cast<uint64_t *>(divisor),
|
||||
bsks, (uint64_t **)(ksks), mem, num_blocks);
|
||||
break;
|
||||
case 1024:
|
||||
|
||||
host_integer_div_rem_kb<uint64_t, Degree<1024>>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
static_cast<uint64_t *>(quotient), static_cast<uint64_t *>(remainder),
|
||||
static_cast<uint64_t *>(numerator), static_cast<uint64_t *>(divisor),
|
||||
bsks, (uint64_t **)(ksks), mem, num_blocks);
|
||||
break;
|
||||
case 2048:
|
||||
host_integer_div_rem_kb<uint64_t, Degree<2048>>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
static_cast<uint64_t *>(quotient), static_cast<uint64_t *>(remainder),
|
||||
static_cast<uint64_t *>(numerator), static_cast<uint64_t *>(divisor),
|
||||
bsks, (uint64_t **)(ksks), mem, num_blocks);
|
||||
break;
|
||||
case 4096:
|
||||
host_integer_div_rem_kb<uint64_t, Degree<4096>>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
static_cast<uint64_t *>(quotient), static_cast<uint64_t *>(remainder),
|
||||
static_cast<uint64_t *>(numerator), static_cast<uint64_t *>(divisor),
|
||||
bsks, (uint64_t **)(ksks), mem, num_blocks);
|
||||
break;
|
||||
case 8192:
|
||||
host_integer_div_rem_kb<uint64_t, Degree<8192>>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
static_cast<uint64_t *>(quotient), static_cast<uint64_t *>(remainder),
|
||||
static_cast<uint64_t *>(numerator), static_cast<uint64_t *>(divisor),
|
||||
bsks, (uint64_t **)(ksks), mem, num_blocks);
|
||||
break;
|
||||
case 16384:
|
||||
host_integer_div_rem_kb<uint64_t, Degree<16384>>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
static_cast<uint64_t *>(quotient), static_cast<uint64_t *>(remainder),
|
||||
static_cast<uint64_t *>(numerator), static_cast<uint64_t *>(divisor),
|
||||
bsks, (uint64_t **)(ksks), mem, num_blocks);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (integer div_rem): unsupported polynomial size. "
|
||||
"Only N = 512, 1024, 2048, 4096, 8192, 16384 is supported")
|
||||
}
|
||||
host_integer_div_rem_kb<uint64_t>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
static_cast<uint64_t *>(quotient), static_cast<uint64_t *>(remainder),
|
||||
static_cast<uint64_t *>(numerator), static_cast<uint64_t *>(divisor),
|
||||
bsks, (uint64_t **)(ksks), mem, num_blocks);
|
||||
}
|
||||
|
||||
void cleanup_cuda_integer_div_rem(void **streams, uint32_t *gpu_indexes,
|
||||
|
||||
@@ -14,7 +14,6 @@
|
||||
#include "utils/kernel_dimensions.cuh"
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <omp.h>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
@@ -31,17 +30,13 @@ template <typename Torus> struct lwe_ciphertext_list {
|
||||
int_radix_params params;
|
||||
|
||||
size_t big_lwe_size;
|
||||
size_t radix_size;
|
||||
size_t big_lwe_size_bytes;
|
||||
size_t radix_size_bytes;
|
||||
size_t big_lwe_dimension;
|
||||
|
||||
lwe_ciphertext_list(Torus *src, int_radix_params params, size_t max_blocks)
|
||||
: data(src), params(params), max_blocks(max_blocks) {
|
||||
big_lwe_size = params.big_lwe_dimension + 1;
|
||||
big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
|
||||
radix_size = max_blocks * big_lwe_size;
|
||||
radix_size_bytes = radix_size * sizeof(Torus);
|
||||
big_lwe_dimension = params.big_lwe_dimension;
|
||||
len = max_blocks;
|
||||
}
|
||||
@@ -173,7 +168,7 @@ __host__ void scratch_cuda_integer_div_rem_kb(
|
||||
streams, gpu_indexes, gpu_count, params, num_blocks, allocate_gpu_memory);
|
||||
}
|
||||
|
||||
template <typename Torus, class params>
|
||||
template <typename Torus>
|
||||
__host__ void
|
||||
host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
uint32_t gpu_count, Torus *quotient, Torus *remainder,
|
||||
@@ -376,35 +371,19 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
for (uint j = 0; j < gpu_count; j++) {
|
||||
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
|
||||
}
|
||||
#pragma omp parallel sections
|
||||
{
|
||||
#pragma omp section
|
||||
{
|
||||
// interesting_divisor
|
||||
trim_last_interesting_divisor_bits(mem_ptr->sub_streams_1, gpu_indexes,
|
||||
gpu_count);
|
||||
}
|
||||
#pragma omp section
|
||||
{
|
||||
// divisor_ms_blocks
|
||||
trim_first_divisor_ms_bits(mem_ptr->sub_streams_2, gpu_indexes,
|
||||
gpu_count);
|
||||
}
|
||||
#pragma omp section
|
||||
{
|
||||
// interesting_remainder1
|
||||
// numerator_block_stack
|
||||
left_shift_interesting_remainder1(mem_ptr->sub_streams_3, gpu_indexes,
|
||||
gpu_count);
|
||||
}
|
||||
#pragma omp section
|
||||
{
|
||||
// interesting_remainder2
|
||||
left_shift_interesting_remainder2(mem_ptr->sub_streams_4, gpu_indexes,
|
||||
gpu_count);
|
||||
}
|
||||
}
|
||||
for (uint j = 0; j < gpu_count; j++) {
|
||||
// interesting_divisor
|
||||
trim_last_interesting_divisor_bits(mem_ptr->sub_streams_1, gpu_indexes,
|
||||
gpu_count);
|
||||
// divisor_ms_blocks
|
||||
trim_first_divisor_ms_bits(mem_ptr->sub_streams_2, gpu_indexes, gpu_count);
|
||||
// interesting_remainder1
|
||||
// numerator_block_stack
|
||||
left_shift_interesting_remainder1(mem_ptr->sub_streams_3, gpu_indexes,
|
||||
gpu_count);
|
||||
// interesting_remainder2
|
||||
left_shift_interesting_remainder2(mem_ptr->sub_streams_4, gpu_indexes,
|
||||
gpu_count);
|
||||
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
|
||||
cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
|
||||
cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
|
||||
cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]);
|
||||
@@ -439,7 +418,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
// `subtraction_overflowed` - single ciphertext
|
||||
auto do_overflowing_sub = [&](cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
uint32_t gpu_count) {
|
||||
host_integer_overflowing_sub_kb<Torus, params>(
|
||||
host_integer_overflowing_sub_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, new_remainder.data,
|
||||
subtraction_overflowed.data, merged_interesting_remainder.data,
|
||||
interesting_divisor.data, bsks, ksks, mem_ptr->overflow_sub_mem,
|
||||
@@ -493,28 +472,15 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
for (uint j = 0; j < gpu_count; j++) {
|
||||
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
|
||||
}
|
||||
#pragma omp parallel sections
|
||||
{
|
||||
#pragma omp section
|
||||
{
|
||||
// new_remainder
|
||||
// subtraction_overflowed
|
||||
do_overflowing_sub(mem_ptr->sub_streams_1, gpu_indexes, gpu_count);
|
||||
}
|
||||
#pragma omp section
|
||||
{
|
||||
// at_least_one_upper_block_is_non_zero
|
||||
check_divisor_upper_blocks(mem_ptr->sub_streams_2, gpu_indexes,
|
||||
gpu_count);
|
||||
}
|
||||
#pragma omp section
|
||||
{
|
||||
// cleaned_merged_interesting_remainder
|
||||
create_clean_version_of_merged_remainder(mem_ptr->sub_streams_3,
|
||||
gpu_indexes, gpu_count);
|
||||
}
|
||||
}
|
||||
for (uint j = 0; j < gpu_count; j++) {
|
||||
// new_remainder
|
||||
// subtraction_overflowed
|
||||
do_overflowing_sub(mem_ptr->sub_streams_1, gpu_indexes, gpu_count);
|
||||
// at_least_one_upper_block_is_non_zero
|
||||
check_divisor_upper_blocks(mem_ptr->sub_streams_2, gpu_indexes, gpu_count);
|
||||
// cleaned_merged_interesting_remainder
|
||||
create_clean_version_of_merged_remainder(mem_ptr->sub_streams_3,
|
||||
gpu_indexes, gpu_count);
|
||||
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
|
||||
cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
|
||||
cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
|
||||
cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]);
|
||||
@@ -571,27 +537,15 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
for (uint j = 0; j < gpu_count; j++) {
|
||||
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
|
||||
}
|
||||
#pragma omp parallel sections
|
||||
{
|
||||
#pragma omp section
|
||||
{
|
||||
// cleaned_merged_interesting_remainder
|
||||
conditionally_zero_out_merged_interesting_remainder(
|
||||
mem_ptr->sub_streams_1, gpu_indexes, gpu_count);
|
||||
}
|
||||
#pragma omp section
|
||||
{
|
||||
// new_remainder
|
||||
conditionally_zero_out_merged_new_remainder(mem_ptr->sub_streams_2,
|
||||
gpu_indexes, gpu_count);
|
||||
}
|
||||
#pragma omp section
|
||||
{
|
||||
// quotient
|
||||
set_quotient_bit(mem_ptr->sub_streams_3, gpu_indexes, gpu_count);
|
||||
}
|
||||
}
|
||||
for (uint j = 0; j < gpu_count; j++) {
|
||||
// cleaned_merged_interesting_remainder
|
||||
conditionally_zero_out_merged_interesting_remainder(mem_ptr->sub_streams_1,
|
||||
gpu_indexes, gpu_count);
|
||||
// new_remainder
|
||||
conditionally_zero_out_merged_new_remainder(mem_ptr->sub_streams_2,
|
||||
gpu_indexes, gpu_count);
|
||||
// quotient
|
||||
set_quotient_bit(mem_ptr->sub_streams_3, gpu_indexes, gpu_count);
|
||||
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
|
||||
cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
|
||||
cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
|
||||
cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]);
|
||||
@@ -617,22 +571,13 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
for (uint j = 0; j < gpu_count; j++) {
|
||||
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
|
||||
}
|
||||
#pragma omp parallel sections
|
||||
{
|
||||
#pragma omp section
|
||||
{
|
||||
integer_radix_apply_univariate_lookup_table_kb(
|
||||
mem_ptr->sub_streams_1, gpu_indexes, gpu_count, remainder, remainder,
|
||||
bsks, ksks, num_blocks, mem_ptr->message_extract_lut_1);
|
||||
}
|
||||
#pragma omp section
|
||||
{
|
||||
integer_radix_apply_univariate_lookup_table_kb(
|
||||
mem_ptr->sub_streams_2, gpu_indexes, gpu_count, quotient, quotient,
|
||||
bsks, ksks, num_blocks, mem_ptr->message_extract_lut_2);
|
||||
}
|
||||
}
|
||||
for (uint j = 0; j < gpu_count; j++) {
|
||||
integer_radix_apply_univariate_lookup_table_kb(
|
||||
mem_ptr->sub_streams_1, gpu_indexes, gpu_count, remainder, remainder,
|
||||
bsks, ksks, num_blocks, mem_ptr->message_extract_lut_1);
|
||||
integer_radix_apply_univariate_lookup_table_kb(
|
||||
mem_ptr->sub_streams_2, gpu_indexes, gpu_count, quotient, quotient, bsks,
|
||||
ksks, num_blocks, mem_ptr->message_extract_lut_2);
|
||||
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
|
||||
cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
|
||||
cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
|
||||
}
|
||||
|
||||
@@ -19,9 +19,8 @@ void scratch_cuda_full_propagation_64(
|
||||
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
|
||||
uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t num_radix_blocks,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
bool allocate_gpu_memory) {
|
||||
uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory) {
|
||||
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
|
||||
glwe_dimension * polynomial_size, lwe_dimension,
|
||||
ks_level, ks_base_log, pbs_level, pbs_base_log,
|
||||
@@ -29,8 +28,7 @@ void scratch_cuda_full_propagation_64(
|
||||
|
||||
scratch_cuda_full_propagation<uint64_t>(
|
||||
(cudaStream_t *)streams, gpu_indexes, gpu_count,
|
||||
(int_fullprop_buffer<uint64_t> **)mem_ptr, params, num_radix_blocks,
|
||||
allocate_gpu_memory);
|
||||
(int_fullprop_buffer<uint64_t> **)mem_ptr, params, allocate_gpu_memory);
|
||||
}
|
||||
|
||||
void cleanup_cuda_full_propagation(void **streams, uint32_t *gpu_indexes,
|
||||
@@ -175,3 +173,55 @@ void cleanup_cuda_apply_bivariate_lut_kb_64(void **streams,
|
||||
int_radix_lut<uint64_t> *mem_ptr = (int_radix_lut<uint64_t> *)(*mem_ptr_void);
|
||||
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
|
||||
}
|
||||
|
||||
// C entry point allocating the scratch object for the 64-bit Hillis-Steele
// prefix sum.
//
// Builds an int_radix_params from the cryptographic parameters (the big LWE
// dimension is glwe_dimension * polynomial_size) and delegates to
// scratch_cuda_apply_bivariate_lut_kb<uint64_t>, which receives `mem_ptr`
// reinterpreted as an int_radix_lut<uint64_t> handle and `input_lut` as a
// uint64_t buffer.
void scratch_cuda_integer_compute_prefix_sum_hillis_steele_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
    void *input_lut, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
    PBS_TYPE pbs_type, bool allocate_gpu_memory) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          glwe_dimension * polynomial_size, lwe_dimension,
                          ks_level, ks_base_log, pbs_level, pbs_base_log,
                          grouping_factor, message_modulus, carry_modulus);

  // Typed views over the opaque arguments of the C interface
  auto *cuda_streams = (cudaStream_t *)(streams);
  auto **lut_handle = (int_radix_lut<uint64_t> **)mem_ptr;
  auto *lut_values = static_cast<uint64_t *>(input_lut);

  scratch_cuda_apply_bivariate_lut_kb<uint64_t>(
      cuda_streams, gpu_indexes, gpu_count, lut_handle, lut_values,
      num_radix_blocks, params, allocate_gpu_memory);
}
|
||||
|
||||
// C entry point running the 64-bit Hillis-Steele prefix sum.
//
// `mem_ptr` is the scratch handle, reinterpreted as an
// int_radix_lut<uint64_t>; its `params` member is forwarded together with the
// LUT itself to host_compute_prefix_sum_hillis_steele<uint64_t>.
// `output_radix_lwe` and `input_radix_lwe` are treated as uint64_t buffers.
//
// Note: the `shift` parameter is not used by this function, and the
// keyswitch keys come before the bootstrapping keys in this signature while
// the host function receives them in the opposite order.
void cuda_integer_compute_prefix_sum_hillis_steele_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    void *output_radix_lwe, void *input_radix_lwe, int8_t *mem_ptr, void **ksks,
    void **bsks, uint32_t num_blocks, uint32_t shift) {

  auto *lut = (int_radix_lut<uint64_t> *)mem_ptr;
  int_radix_params params = lut->params;

  auto *cuda_streams = (cudaStream_t *)(streams);
  auto *output = static_cast<uint64_t *>(output_radix_lwe);
  auto *input = static_cast<uint64_t *>(input_radix_lwe);

  host_compute_prefix_sum_hillis_steele<uint64_t>(
      cuda_streams, gpu_indexes, gpu_count, output, input, params, lut, bsks,
      (uint64_t **)(ksks), num_blocks);
}
|
||||
|
||||
// Releases the resources owned by the scratch object of the Hillis-Steele
// prefix sum.
//
// `mem_ptr_void` is the opaque handle to the scratch object; it is
// reinterpreted as an int_radix_lut<uint64_t> and its release() method is
// called. The handle itself is neither freed nor reset by this function.
void cleanup_cuda_integer_compute_prefix_sum_hillis_steele_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr_void) {
  using lut_t = int_radix_lut<uint64_t>;

  auto *cuda_streams = (cudaStream_t *)(streams);
  auto *lut = (lut_t *)(*mem_ptr_void);

  lut->release(cuda_streams, gpu_indexes, gpu_count);
}
|
||||
|
||||
// C entry point reversing, in place, the order of the blocks of a 64-bit
// radix ciphertext.
//
// `lwe_array` is treated as a uint64_t buffer made of `num_blocks` blocks of
// `lwe_size` elements each and is forwarded to
// host_radix_blocks_reverse_inplace<uint64_t>.
// Note: `gpu_count` is not used by this function.
void cuda_integer_reverse_blocks_64_inplace(void **streams,
                                            uint32_t *gpu_indexes,
                                            uint32_t gpu_count, void *lwe_array,
                                            uint32_t num_blocks,
                                            uint32_t lwe_size) {

  auto *cuda_streams = (cudaStream_t *)(streams);
  auto *blocks = static_cast<uint64_t *>(lwe_array);

  host_radix_blocks_reverse_inplace<uint64_t>(cuda_streams, gpu_indexes, blocks,
                                              num_blocks, lwe_size);
}
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
|
||||
#include "crypto/keyswitch.cuh"
|
||||
#include "device.h"
|
||||
#include "helper_multi_gpu.h"
|
||||
#include "integer.h"
|
||||
#include "integer/scalar_addition.cuh"
|
||||
#include "linear_algebra.h"
|
||||
@@ -10,6 +11,7 @@
|
||||
#include "polynomial/functions.cuh"
|
||||
#include "programmable_bootstrap.h"
|
||||
#include "utils/helper.cuh"
|
||||
#include "utils/helper_multi_gpu.cuh"
|
||||
#include "utils/kernel_dimensions.cuh"
|
||||
#include <functional>
|
||||
|
||||
@@ -20,18 +22,19 @@ template <typename Torus>
|
||||
__global__ void radix_blocks_rotate_right(Torus *dst, Torus *src,
|
||||
uint32_t value, uint32_t blocks_count,
|
||||
uint32_t lwe_size) {
|
||||
value %= blocks_count;
|
||||
|
||||
size_t tid = threadIdx.x;
|
||||
size_t src_block_id = blockIdx.x;
|
||||
size_t dst_block_id = (src_block_id + value) % blocks_count;
|
||||
size_t stride = blockDim.x;
|
||||
if (tid < lwe_size) {
|
||||
value %= blocks_count;
|
||||
size_t src_block_id = blockIdx.x;
|
||||
size_t dst_block_id = (src_block_id + value) % blocks_count;
|
||||
size_t stride = blockDim.x;
|
||||
|
||||
auto cur_src_block = &src[src_block_id * lwe_size];
|
||||
auto cur_dst_block = &dst[dst_block_id * lwe_size];
|
||||
auto cur_src_block = &src[src_block_id * lwe_size];
|
||||
auto cur_dst_block = &dst[dst_block_id * lwe_size];
|
||||
|
||||
for (size_t i = tid; i < lwe_size; i += stride) {
|
||||
cur_dst_block[i] = cur_src_block[i];
|
||||
for (size_t i = tid; i < lwe_size; i += stride) {
|
||||
cur_dst_block[i] = cur_src_block[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -42,25 +45,28 @@ template <typename Torus>
|
||||
__global__ void radix_blocks_rotate_left(Torus *dst, Torus *src, uint32_t value,
|
||||
uint32_t blocks_count,
|
||||
uint32_t lwe_size) {
|
||||
value %= blocks_count;
|
||||
size_t src_block_id = blockIdx.x;
|
||||
|
||||
size_t tid = threadIdx.x;
|
||||
size_t dst_block_id = (src_block_id >= value)
|
||||
? src_block_id - value
|
||||
: src_block_id - value + blocks_count;
|
||||
size_t stride = blockDim.x;
|
||||
if (tid < lwe_size) {
|
||||
value %= blocks_count;
|
||||
size_t src_block_id = blockIdx.x;
|
||||
|
||||
auto cur_src_block = &src[src_block_id * lwe_size];
|
||||
auto cur_dst_block = &dst[dst_block_id * lwe_size];
|
||||
size_t dst_block_id = (src_block_id >= value)
|
||||
? src_block_id - value
|
||||
: src_block_id - value + blocks_count;
|
||||
size_t stride = blockDim.x;
|
||||
|
||||
for (size_t i = tid; i < lwe_size; i += stride) {
|
||||
cur_dst_block[i] = cur_src_block[i];
|
||||
auto cur_src_block = &src[src_block_id * lwe_size];
|
||||
auto cur_dst_block = &dst[dst_block_id * lwe_size];
|
||||
|
||||
for (size_t i = tid; i < lwe_size; i += stride) {
|
||||
cur_dst_block[i] = cur_src_block[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// rotate radix ciphertext right with specific value
|
||||
// calculation is not inplace, so `dst` and `src` must not be the same
|
||||
// one block is responsible to process single lwe ciphertext
|
||||
template <typename Torus>
|
||||
__host__ void
|
||||
host_radix_blocks_rotate_right(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
@@ -93,6 +99,35 @@ host_radix_blocks_rotate_left(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
dst, src, value, blocks_count, lwe_size);
|
||||
}
|
||||
|
||||
// reverse the blocks in a list
// each cuda block swaps a couple of blocks
//
// Launch layout: one CUDA block per pair to swap, i.e. a 1D grid of
// `blocks_count / 2` blocks; any 1D block size works since the threads of a
// block stride over the `lwe_size` elements of the pair.
// CUDA block `b` swaps radix block `b` with radix block
// `blocks_count - 1 - b`. When `blocks_count` is odd the middle block is left
// untouched.
//
// `src` must hold at least `blocks_count * lwe_size` elements.
template <typename Torus>
__global__ void radix_blocks_reverse_lwe_inplace(Torus *src,
                                                 uint32_t blocks_count,
                                                 uint32_t lwe_size) {

  size_t idx = blockIdx.x;
  // Only the first half of the blocks drives a swap: a surplus CUDA block
  // would otherwise swap a pair back (or index out of range), so it exits.
  // The condition is uniform across the CUDA block.
  if (idx >= blocks_count / 2)
    return;

  size_t rev_idx = blocks_count - 1 - idx;

  Torus *front_block = &src[idx * lwe_size];
  Torus *back_block = &src[rev_idx * lwe_size];

  // Unsigned index: `lwe_size` is unsigned, avoid a signed/unsigned comparison
  for (size_t j = threadIdx.x; j < lwe_size; j += blockDim.x) {
    Torus back_element = back_block[j];
    Torus front_element = front_block[j];
    front_block[j] = back_element;
    back_block[j] = front_element;
  }
}
|
||||
|
||||
// Reverses, in place, the order of the `blocks_count` blocks of `lwe_size`
// elements stored in `src`.
//
// The kernel is enqueued on streams[0] after selecting gpu_indexes[0]; the
// call is asynchronous with respect to the host. One CUDA block of 1024
// threads is launched per pair of radix blocks to swap.
//
// With fewer than two blocks there is nothing to swap: the function returns
// without launching, since a grid of 0 blocks is an invalid launch
// configuration.
template <typename Torus>
__host__ void
host_radix_blocks_reverse_inplace(cudaStream_t *streams, uint32_t *gpu_indexes,
                                  Torus *src, uint32_t blocks_count,
                                  uint32_t lwe_size) {
  int num_blocks = blocks_count / 2, num_threads = 1024;
  if (num_blocks == 0)
    return;

  check_cuda_error(cudaSetDevice(gpu_indexes[0]));
  radix_blocks_reverse_lwe_inplace<Torus>
      <<<num_blocks, num_threads, 0, streams[0]>>>(src, blocks_count, lwe_size);
  // Surface launch-configuration errors right away
  check_cuda_error(cudaGetLastError());
}
|
||||
|
||||
// polynomial_size threads
|
||||
template <typename Torus>
|
||||
__global__ void
|
||||
@@ -153,28 +188,67 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
|
||||
auto polynomial_size = params.polynomial_size;
|
||||
auto grouping_factor = params.grouping_factor;
|
||||
|
||||
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
|
||||
/// Apply KS to go from a big LWE dimension to a small LWE dimension
|
||||
execute_keyswitch<Torus>(streams, gpu_indexes, gpu_count,
|
||||
lut->tmp_lwe_after_ks, lut->lwe_trivial_indexes,
|
||||
lwe_array_in, lut->lwe_indexes_in, ksks,
|
||||
big_lwe_dimension, small_lwe_dimension, ks_base_log,
|
||||
ks_level, num_radix_blocks, false);
|
||||
/// For multi GPU execution we create vectors of pointers for inputs and
|
||||
/// outputs
|
||||
std::vector<Torus *> lwe_array_in_vec = lut->lwe_array_in_vec;
|
||||
std::vector<Torus *> lwe_after_ks_vec = lut->lwe_after_ks_vec;
|
||||
std::vector<Torus *> lwe_after_pbs_vec = lut->lwe_after_pbs_vec;
|
||||
std::vector<Torus *> lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec;
|
||||
|
||||
/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
|
||||
/// dimension to a big LWE dimension
|
||||
execute_pbs<Torus>(
|
||||
streams, gpu_indexes, gpu_count, lwe_array_out, lut->lwe_indexes_out,
|
||||
lut->lut_vec, lut->lut_indexes_vec, lut->tmp_lwe_after_ks,
|
||||
lut->lwe_trivial_indexes, bsks, lut->buffer, glwe_dimension,
|
||||
small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
|
||||
grouping_factor, num_radix_blocks, 1, 0,
|
||||
cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type, false);
|
||||
|
||||
/// Synchronize all GPUs
|
||||
auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
|
||||
for (uint i = 0; i < active_gpu_count; i++) {
|
||||
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
|
||||
if (active_gpu_count == 1) {
|
||||
execute_keyswitch_async<Torus>(streams, gpu_indexes, 1, lwe_after_ks_vec[0],
|
||||
lwe_trivial_indexes_vec[0], lwe_array_in,
|
||||
lut->lwe_indexes_in, ksks, big_lwe_dimension,
|
||||
small_lwe_dimension, ks_base_log, ks_level,
|
||||
num_radix_blocks);
|
||||
|
||||
/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
|
||||
/// dimension to a big LWE dimension
|
||||
execute_pbs_async<Torus>(
|
||||
streams, gpu_indexes, 1, lwe_array_out, lut->lwe_indexes_out,
|
||||
lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec[0],
|
||||
lwe_trivial_indexes_vec[0], bsks, lut->buffer, glwe_dimension,
|
||||
small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
|
||||
grouping_factor, num_radix_blocks, pbs_type);
|
||||
} else {
|
||||
/// Make sure all data that should be on GPU 0 is indeed there
|
||||
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
|
||||
|
||||
/// With multiple GPUs we push to the vectors on each GPU then when we
|
||||
/// gather data to GPU 0 we can copy back to the original indexing
|
||||
multi_gpu_scatter_lwe_async<Torus>(
|
||||
streams, gpu_indexes, active_gpu_count, lwe_array_in_vec, lwe_array_in,
|
||||
lut->h_lwe_indexes_in, lut->using_trivial_lwe_indexes, num_radix_blocks,
|
||||
big_lwe_dimension + 1);
|
||||
|
||||
/// Apply KS to go from a big LWE dimension to a small LWE dimension
|
||||
execute_keyswitch_async<Torus>(streams, gpu_indexes, active_gpu_count,
|
||||
lwe_after_ks_vec, lwe_trivial_indexes_vec,
|
||||
lwe_array_in_vec, lwe_trivial_indexes_vec,
|
||||
ksks, big_lwe_dimension, small_lwe_dimension,
|
||||
ks_base_log, ks_level, num_radix_blocks);
|
||||
|
||||
/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
|
||||
/// dimension to a big LWE dimension
|
||||
execute_pbs_async<Torus>(
|
||||
streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec,
|
||||
lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec,
|
||||
lwe_after_ks_vec, lwe_trivial_indexes_vec, bsks, lut->buffer,
|
||||
glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log,
|
||||
pbs_level, grouping_factor, num_radix_blocks, pbs_type);
|
||||
|
||||
/// Copy data back to GPU 0 and release vecs
|
||||
multi_gpu_gather_lwe_async<Torus>(streams, gpu_indexes, active_gpu_count,
|
||||
lwe_array_out, lwe_after_pbs_vec,
|
||||
lut->h_lwe_indexes_out,
|
||||
lut->using_trivial_lwe_indexes,
|
||||
num_radix_blocks, big_lwe_dimension + 1);
|
||||
|
||||
/// Synchronize all GPUs
|
||||
for (uint i = 0; i < active_gpu_count; i++) {
|
||||
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -205,29 +279,63 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
|
||||
num_radix_blocks);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
|
||||
/// For multi GPU execution we create vectors of pointers for inputs and
|
||||
/// outputs
|
||||
std::vector<Torus *> lwe_array_in_vec = lut->lwe_array_in_vec;
|
||||
std::vector<Torus *> lwe_after_ks_vec = lut->lwe_after_ks_vec;
|
||||
std::vector<Torus *> lwe_after_pbs_vec = lut->lwe_after_pbs_vec;
|
||||
std::vector<Torus *> lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec;
|
||||
|
||||
/// Apply KS to go from a big LWE dimension to a small LWE dimension
|
||||
execute_keyswitch<Torus>(streams, gpu_indexes, gpu_count,
|
||||
lut->tmp_lwe_after_ks, lut->lwe_trivial_indexes,
|
||||
lwe_array_pbs_in, lut->lwe_indexes_in, ksks,
|
||||
big_lwe_dimension, small_lwe_dimension, ks_base_log,
|
||||
ks_level, num_radix_blocks, false);
|
||||
|
||||
/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
|
||||
/// dimension to a big LWE dimension
|
||||
execute_pbs<Torus>(
|
||||
streams, gpu_indexes, gpu_count, lwe_array_out, lut->lwe_indexes_out,
|
||||
lut->lut_vec, lut->lut_indexes_vec, lut->tmp_lwe_after_ks,
|
||||
lut->lwe_trivial_indexes, bsks, lut->buffer, glwe_dimension,
|
||||
small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
|
||||
grouping_factor, num_radix_blocks, 1, 0,
|
||||
cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type, false);
|
||||
|
||||
/// Synchronize all GPUs
|
||||
auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
|
||||
for (uint i = 0; i < active_gpu_count; i++) {
|
||||
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
|
||||
if (active_gpu_count == 1) {
|
||||
execute_keyswitch_async<Torus>(streams, gpu_indexes, 1, lwe_after_ks_vec[0],
|
||||
lwe_trivial_indexes_vec[0], lwe_array_pbs_in,
|
||||
lut->lwe_indexes_in, ksks, big_lwe_dimension,
|
||||
small_lwe_dimension, ks_base_log, ks_level,
|
||||
num_radix_blocks);
|
||||
|
||||
/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
|
||||
/// dimension to a big LWE dimension
|
||||
execute_pbs_async<Torus>(
|
||||
streams, gpu_indexes, 1, lwe_array_out, lut->lwe_indexes_out,
|
||||
lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec[0],
|
||||
lwe_trivial_indexes_vec[0], bsks, lut->buffer, glwe_dimension,
|
||||
small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
|
||||
grouping_factor, num_radix_blocks, pbs_type);
|
||||
} else {
|
||||
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
|
||||
multi_gpu_scatter_lwe_async<Torus>(
|
||||
streams, gpu_indexes, active_gpu_count, lwe_array_in_vec,
|
||||
lwe_array_pbs_in, lut->h_lwe_indexes_in, lut->using_trivial_lwe_indexes,
|
||||
num_radix_blocks, big_lwe_dimension + 1);
|
||||
|
||||
/// Apply KS to go from a big LWE dimension to a small LWE dimension
|
||||
execute_keyswitch_async<Torus>(streams, gpu_indexes, active_gpu_count,
|
||||
lwe_after_ks_vec, lwe_trivial_indexes_vec,
|
||||
lwe_array_in_vec, lwe_trivial_indexes_vec,
|
||||
ksks, big_lwe_dimension, small_lwe_dimension,
|
||||
ks_base_log, ks_level, num_radix_blocks);
|
||||
|
||||
/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
|
||||
/// dimension to a big LWE dimension
|
||||
execute_pbs_async<Torus>(
|
||||
streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec,
|
||||
lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec,
|
||||
lwe_after_ks_vec, lwe_trivial_indexes_vec, bsks, lut->buffer,
|
||||
glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log,
|
||||
pbs_level, grouping_factor, num_radix_blocks, pbs_type);
|
||||
|
||||
/// Copy data back to GPU 0 and release vecs
|
||||
multi_gpu_gather_lwe_async<Torus>(streams, gpu_indexes, active_gpu_count,
|
||||
lwe_array_out, lwe_after_pbs_vec,
|
||||
lut->h_lwe_indexes_out,
|
||||
lut->using_trivial_lwe_indexes,
|
||||
num_radix_blocks, big_lwe_dimension + 1);
|
||||
|
||||
/// Synchronize all GPUs
|
||||
for (uint i = 0; i < active_gpu_count; i++) {
|
||||
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -325,7 +433,6 @@ void generate_device_accumulator_bivariate(
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, std::function<Torus(Torus, Torus)> f) {
|
||||
|
||||
cudaSetDevice(gpu_index);
|
||||
// host lut
|
||||
Torus *h_lut =
|
||||
(Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus));
|
||||
@@ -335,14 +442,14 @@ void generate_device_accumulator_bivariate(
|
||||
message_modulus, carry_modulus, f);
|
||||
|
||||
// copy host lut and lut_indexes_vec to device
|
||||
cuda_synchronize_stream(stream, gpu_index);
|
||||
cuda_memcpy_async_to_gpu(acc_bivariate, h_lut,
|
||||
(glwe_dimension + 1) * polynomial_size *
|
||||
sizeof(Torus),
|
||||
stream, gpu_index);
|
||||
|
||||
// Release memory when possible
|
||||
cuda_stream_add_callback(stream, gpu_index, host_free_on_stream_callback,
|
||||
h_lut);
|
||||
cuda_synchronize_stream(stream, gpu_index);
|
||||
free(h_lut);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -358,7 +465,6 @@ void generate_device_accumulator_bivariate_with_factor(
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, std::function<Torus(Torus, Torus)> f, int factor) {
|
||||
|
||||
cudaSetDevice(gpu_index);
|
||||
// host lut
|
||||
Torus *h_lut =
|
||||
(Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus));
|
||||
@@ -368,15 +474,15 @@ void generate_device_accumulator_bivariate_with_factor(
|
||||
h_lut, glwe_dimension, polynomial_size, message_modulus, carry_modulus, f,
|
||||
factor);
|
||||
|
||||
cuda_synchronize_stream(stream, gpu_index);
|
||||
// copy host lut and lut_indexes_vec to device
|
||||
cuda_memcpy_async_to_gpu(acc_bivariate, h_lut,
|
||||
(glwe_dimension + 1) * polynomial_size *
|
||||
sizeof(Torus),
|
||||
stream, gpu_index);
|
||||
|
||||
// Release memory when possible
|
||||
cuda_stream_add_callback(stream, gpu_index, host_free_on_stream_callback,
|
||||
h_lut);
|
||||
cuda_synchronize_stream(stream, gpu_index);
|
||||
free(h_lut);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -394,7 +500,6 @@ void generate_device_accumulator(cudaStream_t stream, uint32_t gpu_index,
|
||||
uint32_t carry_modulus,
|
||||
std::function<Torus(Torus)> f) {
|
||||
|
||||
cudaSetDevice(gpu_index);
|
||||
// host lut
|
||||
Torus *h_lut =
|
||||
(Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus));
|
||||
@@ -403,14 +508,14 @@ void generate_device_accumulator(cudaStream_t stream, uint32_t gpu_index,
|
||||
generate_lookup_table<Torus>(h_lut, glwe_dimension, polynomial_size,
|
||||
message_modulus, carry_modulus, f);
|
||||
|
||||
cuda_synchronize_stream(stream, gpu_index);
|
||||
// copy host lut and lut_indexes_vec to device
|
||||
cuda_memcpy_async_to_gpu(
|
||||
acc, h_lut, (glwe_dimension + 1) * polynomial_size * sizeof(Torus),
|
||||
stream, gpu_index);
|
||||
|
||||
// Release memory when possible
|
||||
cuda_stream_add_callback(stream, gpu_index, host_free_on_stream_callback,
|
||||
h_lut);
|
||||
cuda_synchronize_stream(stream, gpu_index);
|
||||
free(h_lut);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
@@ -424,6 +529,43 @@ void scratch_cuda_propagate_single_carry_kb_inplace(
|
||||
num_radix_blocks, allocate_gpu_memory);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
void host_compute_prefix_sum_hillis_steele(
|
||||
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
|
||||
Torus *step_output, Torus *generates_or_propagates, int_radix_params params,
|
||||
int_radix_lut<Torus> *luts, void **bsks, Torus **ksks,
|
||||
uint32_t num_blocks) {
|
||||
|
||||
auto glwe_dimension = params.glwe_dimension;
|
||||
auto polynomial_size = params.polynomial_size;
|
||||
auto big_lwe_size = glwe_dimension * polynomial_size + 1;
|
||||
auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
|
||||
|
||||
int num_steps = ceil(log2((double)num_blocks));
|
||||
int space = 1;
|
||||
cuda_memcpy_async_gpu_to_gpu(step_output, generates_or_propagates,
|
||||
big_lwe_size_bytes * num_blocks, streams[0],
|
||||
gpu_indexes[0]);
|
||||
|
||||
for (int step = 0; step < num_steps; step++) {
|
||||
if (space > num_blocks - 1)
|
||||
PANIC("Cuda error: step output is going out of bounds in Hillis Steele "
|
||||
"propagation")
|
||||
auto cur_blocks = &step_output[space * big_lwe_size];
|
||||
auto prev_blocks = generates_or_propagates;
|
||||
int cur_total_blocks = num_blocks - space;
|
||||
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, cur_blocks, cur_blocks, prev_blocks,
|
||||
bsks, ksks, cur_total_blocks, luts, luts->params.message_modulus);
|
||||
|
||||
cuda_memcpy_async_gpu_to_gpu(
|
||||
&generates_or_propagates[space * big_lwe_size], cur_blocks,
|
||||
big_lwe_size_bytes * cur_total_blocks, streams[0], gpu_indexes[0]);
|
||||
space *= 2;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
void host_propagate_single_carry(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
uint32_t gpu_count, Torus *lwe_array,
|
||||
@@ -448,29 +590,9 @@ void host_propagate_single_carry(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
ksks, num_blocks, luts_array);
|
||||
|
||||
// compute prefix sum with hillis&steele
|
||||
|
||||
int num_steps = ceil(log2((double)num_blocks));
|
||||
int space = 1;
|
||||
cuda_memcpy_async_gpu_to_gpu(step_output, generates_or_propagates,
|
||||
big_lwe_size_bytes * num_blocks, streams[0],
|
||||
gpu_indexes[0]);
|
||||
|
||||
for (int step = 0; step < num_steps; step++) {
|
||||
auto cur_blocks = &step_output[space * big_lwe_size];
|
||||
auto prev_blocks = generates_or_propagates;
|
||||
int cur_total_blocks = num_blocks - space;
|
||||
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, cur_blocks, cur_blocks, prev_blocks,
|
||||
bsks, ksks, cur_total_blocks, luts_carry_propagation_sum,
|
||||
luts_carry_propagation_sum->params.message_modulus);
|
||||
|
||||
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
|
||||
cuda_memcpy_async_gpu_to_gpu(
|
||||
&generates_or_propagates[space * big_lwe_size], cur_blocks,
|
||||
big_lwe_size_bytes * cur_total_blocks, streams[0], gpu_indexes[0]);
|
||||
space *= 2;
|
||||
}
|
||||
host_compute_prefix_sum_hillis_steele(
|
||||
streams, gpu_indexes, gpu_count, step_output, generates_or_propagates,
|
||||
params, luts_carry_propagation_sum, bsks, ksks, num_blocks);
|
||||
|
||||
host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count, step_output,
|
||||
generates_or_propagates, 1, num_blocks,
|
||||
@@ -496,11 +618,24 @@ void host_propagate_single_carry(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
num_blocks, message_acc);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
void host_generate_last_block_inner_propagation(
|
||||
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
|
||||
Torus *last_block_inner_propagation, Torus *lhs, Torus *rhs,
|
||||
int_last_block_inner_propagate_memory<Torus> *mem, void **bsks,
|
||||
Torus **ksks) {
|
||||
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, last_block_inner_propagation, lhs, rhs,
|
||||
bsks, ksks, 1, mem->last_block_inner_propagation_lut,
|
||||
mem->params.message_modulus);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
void host_propagate_single_sub_borrow(cudaStream_t *streams,
|
||||
uint32_t *gpu_indexes, uint32_t gpu_count,
|
||||
Torus *overflowed, Torus *lwe_array,
|
||||
int_single_borrow_prop_memory<Torus> *mem,
|
||||
int_overflowing_sub_memory<Torus> *mem,
|
||||
void **bsks, Torus **ksks,
|
||||
uint32_t num_blocks) {
|
||||
auto params = mem->params;
|
||||
@@ -521,27 +656,9 @@ void host_propagate_single_sub_borrow(cudaStream_t *streams,
|
||||
ksks, num_blocks, luts_array);
|
||||
|
||||
// compute prefix sum with hillis&steele
|
||||
int num_steps = ceil(log2((double)num_blocks));
|
||||
int space = 1;
|
||||
cuda_memcpy_async_gpu_to_gpu(step_output, generates_or_propagates,
|
||||
big_lwe_size_bytes * num_blocks, streams[0],
|
||||
gpu_indexes[0]);
|
||||
|
||||
for (int step = 0; step < num_steps; step++) {
|
||||
auto cur_blocks = &step_output[space * big_lwe_size];
|
||||
auto prev_blocks = generates_or_propagates;
|
||||
int cur_total_blocks = num_blocks - space;
|
||||
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, cur_blocks, cur_blocks, prev_blocks,
|
||||
bsks, ksks, cur_total_blocks, luts_carry_propagation_sum,
|
||||
luts_carry_propagation_sum->params.message_modulus);
|
||||
|
||||
cuda_memcpy_async_gpu_to_gpu(
|
||||
&generates_or_propagates[space * big_lwe_size], cur_blocks,
|
||||
big_lwe_size_bytes * cur_total_blocks, streams[0], gpu_indexes[0]);
|
||||
space *= 2;
|
||||
}
|
||||
host_compute_prefix_sum_hillis_steele<Torus>(
|
||||
streams, gpu_indexes, gpu_count, step_output, generates_or_propagates,
|
||||
params, luts_carry_propagation_sum, bsks, ksks, num_blocks);
|
||||
|
||||
cuda_memcpy_async_gpu_to_gpu(
|
||||
overflowed, &generates_or_propagates[big_lwe_size * (num_blocks - 1)],
|
||||
@@ -583,12 +700,11 @@ void host_full_propagate_inplace(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
for (int i = 0; i < num_blocks; i++) {
|
||||
auto cur_input_block = &input_blocks[i * big_lwe_size];
|
||||
|
||||
cudaSetDevice(gpu_indexes[0]);
|
||||
/// Since the keyswitch is done on one input only, use only 1 GPU
|
||||
cuda_keyswitch_lwe_ciphertext_vector<Torus>(
|
||||
streams[0], gpu_indexes[0], mem_ptr->tmp_small_lwe_vector,
|
||||
execute_keyswitch_async<Torus>(
|
||||
streams, gpu_indexes, 1, mem_ptr->tmp_small_lwe_vector,
|
||||
mem_ptr->lut->lwe_trivial_indexes, cur_input_block,
|
||||
mem_ptr->lut->lwe_trivial_indexes, ksks[0], params.big_lwe_dimension,
|
||||
mem_ptr->lut->lwe_trivial_indexes, ksks, params.big_lwe_dimension,
|
||||
params.small_lwe_dimension, params.ks_base_log, params.ks_level, 1);
|
||||
|
||||
cuda_memcpy_async_gpu_to_gpu(&mem_ptr->tmp_small_lwe_vector[small_lwe_size],
|
||||
@@ -596,15 +712,14 @@ void host_full_propagate_inplace(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
small_lwe_size * sizeof(Torus), streams[0],
|
||||
gpu_indexes[0]);
|
||||
|
||||
execute_pbs<Torus>(
|
||||
execute_pbs_async<Torus>(
|
||||
streams, gpu_indexes, 1, mem_ptr->tmp_big_lwe_vector,
|
||||
mem_ptr->lut->lwe_trivial_indexes, mem_ptr->lut->lut_vec,
|
||||
mem_ptr->lut->lut_indexes_vec, mem_ptr->tmp_small_lwe_vector,
|
||||
mem_ptr->lut->lwe_trivial_indexes, bsks, mem_ptr->lut->buffer,
|
||||
params.glwe_dimension, params.small_lwe_dimension,
|
||||
params.polynomial_size, params.pbs_base_log, params.pbs_level,
|
||||
params.grouping_factor, 2, 2, 0,
|
||||
cuda_get_max_shared_memory(gpu_indexes[0]), params.pbs_type);
|
||||
params.grouping_factor, 2, params.pbs_type);
|
||||
|
||||
cuda_memcpy_async_gpu_to_gpu(cur_input_block, mem_ptr->tmp_big_lwe_vector,
|
||||
big_lwe_size * sizeof(Torus), streams[0],
|
||||
@@ -625,12 +740,10 @@ void scratch_cuda_full_propagation(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
uint32_t gpu_count,
|
||||
int_fullprop_buffer<Torus> **mem_ptr,
|
||||
int_radix_params params,
|
||||
uint32_t num_radix_blocks,
|
||||
bool allocate_gpu_memory) {
|
||||
|
||||
*mem_ptr =
|
||||
new int_fullprop_buffer<Torus>(streams, gpu_indexes, gpu_count, params,
|
||||
num_radix_blocks, allocate_gpu_memory);
|
||||
*mem_ptr = new int_fullprop_buffer<Torus>(streams, gpu_indexes, gpu_count,
|
||||
params, allocate_gpu_memory);
|
||||
}
|
||||
|
||||
// (lwe_dimension+1) threads
|
||||
@@ -675,8 +788,9 @@ __host__ void pack_blocks(cudaStream_t stream, uint32_t gpu_index,
|
||||
Torus *lwe_array_out, Torus *lwe_array_in,
|
||||
uint32_t lwe_dimension, uint32_t num_radix_blocks,
|
||||
uint32_t factor) {
|
||||
if (num_radix_blocks == 0)
|
||||
return;
|
||||
cudaSetDevice(gpu_index);
|
||||
|
||||
int num_blocks = 0, num_threads = 0;
|
||||
int num_entries = (lwe_dimension + 1);
|
||||
getNumBlocksAndThreads(num_entries, 1024, num_blocks, num_threads);
|
||||
|
||||
@@ -71,7 +71,7 @@ void scratch_cuda_integer_mult_radix_ciphertext_kb_64(
|
||||
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t pbs_base_log,
|
||||
uint32_t pbs_level, uint32_t ks_base_log, uint32_t ks_level,
|
||||
uint32_t grouping_factor, uint32_t num_radix_blocks, PBS_TYPE pbs_type,
|
||||
uint32_t max_shared_memory, bool allocate_gpu_memory) {
|
||||
bool allocate_gpu_memory) {
|
||||
|
||||
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
|
||||
polynomial_size * glwe_dimension, lwe_dimension,
|
||||
@@ -123,7 +123,6 @@ void scratch_cuda_integer_mult_radix_ciphertext_kb_64(
|
||||
* - 'num_blocks' is the number of big lwe ciphertext blocks inside radix
|
||||
* ciphertext
|
||||
* - 'pbs_type' selects which PBS implementation should be used
|
||||
* - 'max_shared_memory' maximum shared memory per cuda block
|
||||
*/
|
||||
void cuda_integer_mult_radix_ciphertext_kb_64(
|
||||
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
|
||||
@@ -133,7 +132,7 @@ void cuda_integer_mult_radix_ciphertext_kb_64(
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
host_integer_mult_radix_kb<uint64_t, int64_t, AmortizedDegree<256>>(
|
||||
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<256>>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
static_cast<uint64_t *>(radix_lwe_out),
|
||||
static_cast<uint64_t *>(radix_lwe_left),
|
||||
@@ -141,7 +140,7 @@ void cuda_integer_mult_radix_ciphertext_kb_64(
|
||||
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
|
||||
break;
|
||||
case 512:
|
||||
host_integer_mult_radix_kb<uint64_t, int64_t, AmortizedDegree<512>>(
|
||||
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<512>>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
static_cast<uint64_t *>(radix_lwe_out),
|
||||
static_cast<uint64_t *>(radix_lwe_left),
|
||||
@@ -149,7 +148,7 @@ void cuda_integer_mult_radix_ciphertext_kb_64(
|
||||
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
|
||||
break;
|
||||
case 1024:
|
||||
host_integer_mult_radix_kb<uint64_t, int64_t, AmortizedDegree<1024>>(
|
||||
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<1024>>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
static_cast<uint64_t *>(radix_lwe_out),
|
||||
static_cast<uint64_t *>(radix_lwe_left),
|
||||
@@ -157,7 +156,7 @@ void cuda_integer_mult_radix_ciphertext_kb_64(
|
||||
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
|
||||
break;
|
||||
case 2048:
|
||||
host_integer_mult_radix_kb<uint64_t, int64_t, AmortizedDegree<2048>>(
|
||||
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<2048>>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
static_cast<uint64_t *>(radix_lwe_out),
|
||||
static_cast<uint64_t *>(radix_lwe_left),
|
||||
@@ -165,7 +164,7 @@ void cuda_integer_mult_radix_ciphertext_kb_64(
|
||||
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
|
||||
break;
|
||||
case 4096:
|
||||
host_integer_mult_radix_kb<uint64_t, int64_t, AmortizedDegree<4096>>(
|
||||
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<4096>>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
static_cast<uint64_t *>(radix_lwe_out),
|
||||
static_cast<uint64_t *>(radix_lwe_left),
|
||||
@@ -173,7 +172,7 @@ void cuda_integer_mult_radix_ciphertext_kb_64(
|
||||
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
|
||||
break;
|
||||
case 8192:
|
||||
host_integer_mult_radix_kb<uint64_t, int64_t, AmortizedDegree<8192>>(
|
||||
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<8192>>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
static_cast<uint64_t *>(radix_lwe_out),
|
||||
static_cast<uint64_t *>(radix_lwe_left),
|
||||
@@ -181,7 +180,7 @@ void cuda_integer_mult_radix_ciphertext_kb_64(
|
||||
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
|
||||
break;
|
||||
case 16384:
|
||||
host_integer_mult_radix_kb<uint64_t, int64_t, AmortizedDegree<16384>>(
|
||||
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<16384>>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
static_cast<uint64_t *>(radix_lwe_out),
|
||||
static_cast<uint64_t *>(radix_lwe_left),
|
||||
@@ -203,7 +202,7 @@ void cleanup_cuda_integer_mult(void **streams, uint32_t *gpu_indexes,
|
||||
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
|
||||
}
|
||||
|
||||
void scratch_cuda_integer_radix_sum_ciphertexts_vec_kb_64(
|
||||
void scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
|
||||
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t lwe_dimension,
|
||||
uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
|
||||
@@ -216,13 +215,13 @@ void scratch_cuda_integer_radix_sum_ciphertexts_vec_kb_64(
|
||||
glwe_dimension * polynomial_size, lwe_dimension,
|
||||
ks_level, ks_base_log, pbs_level, pbs_base_log,
|
||||
grouping_factor, message_modulus, carry_modulus);
|
||||
scratch_cuda_integer_sum_ciphertexts_vec_kb<uint64_t>(
|
||||
scratch_cuda_integer_partial_sum_ciphertexts_vec_kb<uint64_t>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
(int_sum_ciphertexts_vec_memory<uint64_t> **)mem_ptr, num_blocks_in_radix,
|
||||
max_num_radix_in_vec, params, allocate_gpu_memory);
|
||||
}
|
||||
|
||||
void cuda_integer_radix_sum_ciphertexts_vec_kb_64(
|
||||
void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
|
||||
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
|
||||
void *radix_lwe_out, void *radix_lwe_vec, uint32_t num_radix_in_vec,
|
||||
int8_t *mem_ptr, void **bsks, void **ksks, uint32_t num_blocks_in_radix) {
|
||||
@@ -238,42 +237,47 @@ void cuda_integer_radix_sum_ciphertexts_vec_kb_64(
|
||||
|
||||
switch (mem->params.polynomial_size) {
|
||||
case 512:
|
||||
host_integer_sum_ciphertexts_vec_kb<uint64_t, AmortizedDegree<512>>(
|
||||
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t, AmortizedDegree<512>>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
static_cast<uint64_t *>(radix_lwe_out),
|
||||
static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsks,
|
||||
(uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec);
|
||||
break;
|
||||
case 1024:
|
||||
host_integer_sum_ciphertexts_vec_kb<uint64_t, AmortizedDegree<1024>>(
|
||||
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
|
||||
AmortizedDegree<1024>>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
static_cast<uint64_t *>(radix_lwe_out),
|
||||
static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsks,
|
||||
(uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec);
|
||||
break;
|
||||
case 2048:
|
||||
host_integer_sum_ciphertexts_vec_kb<uint64_t, AmortizedDegree<2048>>(
|
||||
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
|
||||
AmortizedDegree<2048>>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
static_cast<uint64_t *>(radix_lwe_out),
|
||||
static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsks,
|
||||
(uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec);
|
||||
break;
|
||||
case 4096:
|
||||
host_integer_sum_ciphertexts_vec_kb<uint64_t, AmortizedDegree<4096>>(
|
||||
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
|
||||
AmortizedDegree<4096>>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
static_cast<uint64_t *>(radix_lwe_out),
|
||||
static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsks,
|
||||
(uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec);
|
||||
break;
|
||||
case 8192:
|
||||
host_integer_sum_ciphertexts_vec_kb<uint64_t, AmortizedDegree<8192>>(
|
||||
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
|
||||
AmortizedDegree<8192>>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
static_cast<uint64_t *>(radix_lwe_out),
|
||||
static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsks,
|
||||
(uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec);
|
||||
break;
|
||||
case 16384:
|
||||
host_integer_sum_ciphertexts_vec_kb<uint64_t, AmortizedDegree<16384>>(
|
||||
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
|
||||
AmortizedDegree<16384>>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
static_cast<uint64_t *>(radix_lwe_out),
|
||||
static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsks,
|
||||
@@ -287,10 +291,9 @@ void cuda_integer_radix_sum_ciphertexts_vec_kb_64(
|
||||
free(terms_degree);
|
||||
}
|
||||
|
||||
void cleanup_cuda_integer_radix_sum_ciphertexts_vec(void **streams,
|
||||
uint32_t *gpu_indexes,
|
||||
uint32_t gpu_count,
|
||||
int8_t **mem_ptr_void) {
|
||||
void cleanup_cuda_integer_radix_partial_sum_ciphertexts_vec(
|
||||
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
|
||||
int8_t **mem_ptr_void) {
|
||||
int_sum_ciphertexts_vec_memory<uint64_t> *mem_ptr =
|
||||
(int_sum_ciphertexts_vec_memory<uint64_t> *)(*mem_ptr_void);
|
||||
|
||||
|
||||
@@ -8,11 +8,13 @@
|
||||
|
||||
#include "crypto/keyswitch.cuh"
|
||||
#include "device.h"
|
||||
#include "helper_multi_gpu.h"
|
||||
#include "integer.h"
|
||||
#include "integer/integer.cuh"
|
||||
#include "linear_algebra.h"
|
||||
#include "programmable_bootstrap.h"
|
||||
#include "utils/helper.cuh"
|
||||
#include "utils/helper_multi_gpu.cuh"
|
||||
#include "utils/kernel_dimensions.cuh"
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
@@ -91,15 +93,11 @@ all_shifted_lhs_rhs(Torus *radix_lwe_left, Torus *lsb_ciphertext,
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus, sharedMemDegree SMD>
|
||||
template <typename Torus>
|
||||
__global__ void tree_add_chunks(Torus *result_blocks, Torus *input_blocks,
|
||||
uint32_t chunk_size, uint32_t block_size,
|
||||
uint32_t num_blocks) {
|
||||
|
||||
extern __shared__ int8_t sharedmem[];
|
||||
|
||||
Torus *result = (Torus *)sharedmem;
|
||||
|
||||
size_t stride = blockDim.x;
|
||||
size_t chunk_id = blockIdx.x;
|
||||
size_t chunk_elem_size = chunk_size * num_blocks * block_size;
|
||||
@@ -107,10 +105,7 @@ __global__ void tree_add_chunks(Torus *result_blocks, Torus *input_blocks,
|
||||
auto src_chunk = &input_blocks[chunk_id * chunk_elem_size];
|
||||
auto dst_radix = &result_blocks[chunk_id * radix_elem_size];
|
||||
size_t block_stride = blockIdx.y * block_size;
|
||||
auto dst_block = &dst_radix[block_stride];
|
||||
|
||||
if constexpr (SMD == NOSM)
|
||||
result = dst_block;
|
||||
auto result = &dst_radix[block_stride];
|
||||
|
||||
// init shared mem with first radix of chunk
|
||||
size_t tid = threadIdx.x;
|
||||
@@ -125,18 +120,12 @@ __global__ void tree_add_chunks(Torus *result_blocks, Torus *input_blocks,
|
||||
result[i] += cur_src_radix[block_stride + i];
|
||||
}
|
||||
}
|
||||
|
||||
// put result from shared mem to global mem
|
||||
if constexpr (SMD == FULLSM)
|
||||
for (int i = tid; i < block_size; i += stride)
|
||||
dst_block[i] = result[i];
|
||||
}
|
||||
|
||||
template <typename Torus, class params>
|
||||
__global__ void fill_radix_from_lsb_msb(Torus *result_blocks, Torus *lsb_blocks,
|
||||
Torus *msb_blocks,
|
||||
uint32_t glwe_dimension,
|
||||
uint32_t lsb_count, uint32_t msb_count,
|
||||
uint32_t num_blocks) {
|
||||
size_t big_lwe_dimension = glwe_dimension * params::degree + 1;
|
||||
size_t big_lwe_id = blockIdx.x;
|
||||
@@ -180,38 +169,24 @@ __global__ void fill_radix_from_lsb_msb(Torus *result_blocks, Torus *lsb_blocks,
|
||||
}
|
||||
}
|
||||
template <typename Torus>
|
||||
__host__ void scratch_cuda_integer_sum_ciphertexts_vec_kb(
|
||||
__host__ void scratch_cuda_integer_partial_sum_ciphertexts_vec_kb(
|
||||
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
|
||||
int_sum_ciphertexts_vec_memory<Torus> **mem_ptr,
|
||||
uint32_t num_blocks_in_radix, uint32_t max_num_radix_in_vec,
|
||||
int_radix_params params, bool allocate_gpu_memory) {
|
||||
|
||||
size_t sm_size = (params.big_lwe_dimension + 1) * sizeof(Torus);
|
||||
if (sm_size < cuda_get_max_shared_memory(gpu_indexes[0])) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
tree_add_chunks<Torus, FULLSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, sm_size));
|
||||
cudaFuncSetCacheConfig(tree_add_chunks<Torus, FULLSM>,
|
||||
cudaFuncCachePreferShared);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
} else {
|
||||
check_cuda_error(
|
||||
cudaFuncSetAttribute(tree_add_chunks<Torus, NOSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
|
||||
cudaFuncSetCacheConfig(tree_add_chunks<Torus, NOSM>, cudaFuncCachePreferL1);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
*mem_ptr = new int_sum_ciphertexts_vec_memory<Torus>(
|
||||
streams, gpu_indexes, gpu_count, params, num_blocks_in_radix,
|
||||
max_num_radix_in_vec, allocate_gpu_memory);
|
||||
}
|
||||
|
||||
template <typename Torus, class params>
|
||||
__host__ void host_integer_sum_ciphertexts_vec_kb(
|
||||
__host__ void host_integer_partial_sum_ciphertexts_vec_kb(
|
||||
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
|
||||
Torus *radix_lwe_out, Torus *terms, int *terms_degree, void **bsks,
|
||||
uint64_t **ksks, int_sum_ciphertexts_vec_memory<uint64_t> *mem_ptr,
|
||||
uint32_t num_blocks_in_radix, uint32_t num_radix_in_vec) {
|
||||
uint32_t num_blocks_in_radix, uint32_t num_radix_in_vec,
|
||||
int_radix_lut<Torus> *reused_lut = nullptr) {
|
||||
|
||||
auto new_blocks = mem_ptr->new_blocks;
|
||||
auto old_blocks = mem_ptr->old_blocks;
|
||||
@@ -223,11 +198,12 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
|
||||
auto message_modulus = mem_ptr->params.message_modulus;
|
||||
auto carry_modulus = mem_ptr->params.carry_modulus;
|
||||
auto num_blocks = num_blocks_in_radix;
|
||||
auto big_lwe_size = mem_ptr->params.big_lwe_dimension + 1;
|
||||
auto big_lwe_dimension = mem_ptr->params.big_lwe_dimension;
|
||||
auto big_lwe_size = big_lwe_dimension + 1;
|
||||
auto glwe_dimension = mem_ptr->params.glwe_dimension;
|
||||
auto polynomial_size = mem_ptr->params.polynomial_size;
|
||||
auto lwe_dimension = mem_ptr->params.small_lwe_dimension;
|
||||
auto big_lwe_dimension = mem_ptr->params.big_lwe_dimension;
|
||||
auto small_lwe_dimension = mem_ptr->params.small_lwe_dimension;
|
||||
auto small_lwe_size = small_lwe_dimension + 1;
|
||||
|
||||
if (old_blocks != terms) {
|
||||
cuda_memcpy_async_gpu_to_gpu(old_blocks, terms,
|
||||
@@ -246,7 +222,48 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
|
||||
int32_t h_smart_copy_in[r * num_blocks];
|
||||
int32_t h_smart_copy_out[r * num_blocks];
|
||||
|
||||
auto max_shared_memory = cuda_get_max_shared_memory(gpu_indexes[0]);
|
||||
/// Here it is important to query the default max shared memory on device 0
|
||||
/// instead of cuda_get_max_shared_memory,
|
||||
/// to avoid bugs with tree_add_chunks trying to use too much shared memory
|
||||
int max_shared_memory = 0;
|
||||
check_cuda_error(cudaDeviceGetAttribute(
|
||||
&max_shared_memory, cudaDevAttrMaxSharedMemoryPerBlock, 0));
|
||||
|
||||
// create lut object for message and carry
|
||||
// we allocate luts_message_carry in the host function (instead of scratch)
|
||||
// to reduce average memory consumption
|
||||
int_radix_lut<Torus> *luts_message_carry;
|
||||
size_t ch_amount = r / chunk_size;
|
||||
if (!ch_amount)
|
||||
ch_amount++;
|
||||
if (reused_lut == nullptr) {
|
||||
luts_message_carry = new int_radix_lut<Torus>(
|
||||
streams, gpu_indexes, gpu_count, mem_ptr->params, 2,
|
||||
2 * ch_amount * num_blocks, true);
|
||||
} else {
|
||||
luts_message_carry = new int_radix_lut<Torus>(
|
||||
streams, gpu_indexes, gpu_count, mem_ptr->params, 2,
|
||||
2 * ch_amount * num_blocks, reused_lut);
|
||||
}
|
||||
auto message_acc = luts_message_carry->get_lut(gpu_indexes[0], 0);
|
||||
auto carry_acc = luts_message_carry->get_lut(gpu_indexes[0], 1);
|
||||
|
||||
// define functions for each accumulator
|
||||
auto lut_f_message = [message_modulus](Torus x) -> Torus {
|
||||
return x % message_modulus;
|
||||
};
|
||||
auto lut_f_carry = [message_modulus](Torus x) -> Torus {
|
||||
return x / message_modulus;
|
||||
};
|
||||
|
||||
// generate accumulators
|
||||
generate_device_accumulator<Torus>(
|
||||
streams[0], gpu_indexes[0], message_acc, glwe_dimension, polynomial_size,
|
||||
message_modulus, carry_modulus, lut_f_message);
|
||||
generate_device_accumulator<Torus>(
|
||||
streams[0], gpu_indexes[0], carry_acc, glwe_dimension, polynomial_size,
|
||||
message_modulus, carry_modulus, lut_f_carry);
|
||||
luts_message_carry->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
|
||||
|
||||
while (r > 2) {
|
||||
size_t cur_total_blocks = r * num_blocks;
|
||||
@@ -257,12 +274,8 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
|
||||
size_t sm_size = big_lwe_size * sizeof(Torus);
|
||||
|
||||
cudaSetDevice(gpu_indexes[0]);
|
||||
if (sm_size < max_shared_memory)
|
||||
tree_add_chunks<Torus, FULLSM><<<add_grid, 512, sm_size, streams[0]>>>(
|
||||
new_blocks, old_blocks, min(r, chunk_size), big_lwe_size, num_blocks);
|
||||
else
|
||||
tree_add_chunks<Torus, NOSM><<<add_grid, 512, 0, streams[0]>>>(
|
||||
new_blocks, old_blocks, min(r, chunk_size), big_lwe_size, num_blocks);
|
||||
tree_add_chunks<Torus><<<add_grid, 512, 0, streams[0]>>>(
|
||||
new_blocks, old_blocks, min(r, chunk_size), big_lwe_size, num_blocks);
|
||||
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
@@ -275,46 +288,21 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
|
||||
terms_degree, h_lwe_idx_in, h_lwe_idx_out, h_smart_copy_in,
|
||||
h_smart_copy_out, ch_amount, r, num_blocks, chunk_size, message_max,
|
||||
total_count, message_count, carry_count, sm_copy_count);
|
||||
|
||||
// create lut object for message and carry
|
||||
// we allocate luts_message_carry in the host function (instead of scratch)
|
||||
// to reduce average memory consumption
|
||||
auto luts_message_carry = new int_radix_lut<Torus>(
|
||||
streams, gpu_indexes, gpu_count, mem_ptr->params, 2, total_count, true);
|
||||
|
||||
auto message_acc = luts_message_carry->get_lut(gpu_indexes[0], 0);
|
||||
auto carry_acc = luts_message_carry->get_lut(gpu_indexes[0], 1);
|
||||
|
||||
// define functions for each accumulator
|
||||
auto lut_f_message = [message_modulus](Torus x) -> Torus {
|
||||
return x % message_modulus;
|
||||
};
|
||||
auto lut_f_carry = [message_modulus](Torus x) -> Torus {
|
||||
return x / message_modulus;
|
||||
};
|
||||
|
||||
// generate accumulators
|
||||
generate_device_accumulator<Torus>(
|
||||
streams[0], gpu_indexes[0], message_acc, glwe_dimension,
|
||||
polynomial_size, message_modulus, carry_modulus, lut_f_message);
|
||||
generate_device_accumulator<Torus>(
|
||||
streams[0], gpu_indexes[0], carry_acc, glwe_dimension, polynomial_size,
|
||||
message_modulus, carry_modulus, lut_f_carry);
|
||||
|
||||
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
|
||||
auto lwe_indexes_in = luts_message_carry->lwe_indexes_in;
|
||||
auto lwe_indexes_out = luts_message_carry->lwe_indexes_out;
|
||||
luts_message_carry->set_lwe_indexes(streams[0], gpu_indexes[0],
|
||||
h_lwe_idx_in, h_lwe_idx_out);
|
||||
|
||||
size_t copy_size = total_count * sizeof(Torus);
|
||||
cuda_memcpy_async_to_gpu(lwe_indexes_in, h_lwe_idx_in, copy_size,
|
||||
streams[0], gpu_indexes[0]);
|
||||
cuda_memcpy_async_to_gpu(lwe_indexes_out, h_lwe_idx_out, copy_size,
|
||||
streams[0], gpu_indexes[0]);
|
||||
copy_size = sm_copy_count * sizeof(int32_t);
|
||||
size_t copy_size = sm_copy_count * sizeof(int32_t);
|
||||
cuda_memcpy_async_to_gpu(d_smart_copy_in, h_smart_copy_in, copy_size,
|
||||
streams[0], gpu_indexes[0]);
|
||||
cuda_memcpy_async_to_gpu(d_smart_copy_out, h_smart_copy_out, copy_size,
|
||||
streams[0], gpu_indexes[0]);
|
||||
|
||||
// inside d_smart_copy_in there are only -1 values
|
||||
// it's fine to call smart_copy with same pointer
|
||||
// as source and destination
|
||||
smart_copy<<<sm_copy_count, 1024, 0, streams[0]>>>(
|
||||
new_blocks, new_blocks, d_smart_copy_out, d_smart_copy_in,
|
||||
big_lwe_size);
|
||||
@@ -328,28 +316,97 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
|
||||
|
||||
luts_message_carry->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
|
||||
|
||||
auto active_gpu_count = get_active_gpu_count(total_count, gpu_count);
|
||||
/// Apply KS to go from a big LWE dimension to a small LWE dimension
|
||||
/// After this keyswitch execution, we need to synchronize the streams
|
||||
/// because the keyswitch and PBS do not operate on the same number of
|
||||
/// inputs
|
||||
execute_keyswitch<Torus>(streams, gpu_indexes, gpu_count, small_lwe_vector,
|
||||
lwe_indexes_in, new_blocks, lwe_indexes_in, ksks,
|
||||
polynomial_size * glwe_dimension, lwe_dimension,
|
||||
mem_ptr->params.ks_base_log,
|
||||
mem_ptr->params.ks_level, message_count, true);
|
||||
/// For multi GPU execution we create vectors of pointers for inputs and
|
||||
/// outputs
|
||||
std::vector<Torus *> new_blocks_vec = luts_message_carry->lwe_array_in_vec;
|
||||
std::vector<Torus *> small_lwe_vector_vec =
|
||||
luts_message_carry->lwe_after_ks_vec;
|
||||
std::vector<Torus *> lwe_after_pbs_vec =
|
||||
luts_message_carry->lwe_after_pbs_vec;
|
||||
std::vector<Torus *> lwe_trivial_indexes_vec =
|
||||
luts_message_carry->lwe_trivial_indexes_vec;
|
||||
|
||||
/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
|
||||
/// dimension to a big LWE dimension
|
||||
execute_pbs<Torus>(streams, gpu_indexes, gpu_count, new_blocks,
|
||||
lwe_indexes_out, luts_message_carry->lut_vec,
|
||||
luts_message_carry->lut_indexes_vec, small_lwe_vector,
|
||||
lwe_indexes_in, bsks, luts_message_carry->buffer,
|
||||
glwe_dimension, lwe_dimension, polynomial_size,
|
||||
mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level,
|
||||
mem_ptr->params.grouping_factor, total_count, 2, 0,
|
||||
max_shared_memory, mem_ptr->params.pbs_type, true);
|
||||
luts_message_carry->release(streams, gpu_indexes, gpu_count);
|
||||
auto active_gpu_count = get_active_gpu_count(total_count, gpu_count);
|
||||
if (active_gpu_count == 1) {
|
||||
/// Apply KS to go from a big LWE dimension to a small LWE dimension
|
||||
/// After this keyswitch execution, we need to synchronize the streams
|
||||
/// because the keyswitch and PBS do not operate on the same number of
|
||||
/// inputs
|
||||
execute_keyswitch_async<Torus>(
|
||||
streams, gpu_indexes, 1, small_lwe_vector, lwe_indexes_in, new_blocks,
|
||||
lwe_indexes_in, ksks, polynomial_size * glwe_dimension,
|
||||
small_lwe_dimension, mem_ptr->params.ks_base_log,
|
||||
mem_ptr->params.ks_level, message_count);
|
||||
|
||||
/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
|
||||
/// dimension to a big LWE dimension
|
||||
execute_pbs_async<Torus>(
|
||||
streams, gpu_indexes, 1, new_blocks, lwe_indexes_out,
|
||||
luts_message_carry->lut_vec, luts_message_carry->lut_indexes_vec,
|
||||
small_lwe_vector, lwe_indexes_in, bsks, luts_message_carry->buffer,
|
||||
glwe_dimension, small_lwe_dimension, polynomial_size,
|
||||
mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level,
|
||||
mem_ptr->params.grouping_factor, total_count,
|
||||
mem_ptr->params.pbs_type);
|
||||
} else {
|
||||
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
|
||||
|
||||
multi_gpu_scatter_lwe_async<Torus>(
|
||||
streams, gpu_indexes, active_gpu_count, new_blocks_vec, new_blocks,
|
||||
luts_message_carry->h_lwe_indexes_in,
|
||||
luts_message_carry->using_trivial_lwe_indexes, message_count,
|
||||
big_lwe_size);
|
||||
|
||||
/// Apply KS to go from a big LWE dimension to a small LWE dimension
|
||||
/// After this keyswitch execution, we need to synchronize the streams
|
||||
/// because the keyswitch and PBS do not operate on the same number of
|
||||
/// inputs
|
||||
execute_keyswitch_async<Torus>(
|
||||
streams, gpu_indexes, active_gpu_count, small_lwe_vector_vec,
|
||||
lwe_trivial_indexes_vec, new_blocks_vec, lwe_trivial_indexes_vec,
|
||||
ksks, big_lwe_dimension, small_lwe_dimension,
|
||||
mem_ptr->params.ks_base_log, mem_ptr->params.ks_level, total_count);
|
||||
|
||||
/// Copy data back to GPU 0, rebuild the lwe array, and scatter again on a
|
||||
/// different configuration
|
||||
multi_gpu_gather_lwe_async<Torus>(
|
||||
streams, gpu_indexes, gpu_count, small_lwe_vector,
|
||||
small_lwe_vector_vec, luts_message_carry->h_lwe_indexes_in,
|
||||
luts_message_carry->using_trivial_lwe_indexes, message_count,
|
||||
small_lwe_size);
|
||||
/// Synchronize all GPUs
|
||||
for (uint i = 0; i < active_gpu_count; i++) {
|
||||
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
|
||||
}
|
||||
|
||||
multi_gpu_scatter_lwe_async<Torus>(
|
||||
streams, gpu_indexes, gpu_count, small_lwe_vector_vec,
|
||||
small_lwe_vector, luts_message_carry->h_lwe_indexes_in,
|
||||
luts_message_carry->using_trivial_lwe_indexes, total_count,
|
||||
small_lwe_size);
|
||||
|
||||
/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
|
||||
/// dimension to a big LWE dimension
|
||||
execute_pbs_async<Torus>(
|
||||
streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec,
|
||||
lwe_trivial_indexes_vec, luts_message_carry->lut_vec,
|
||||
luts_message_carry->lut_indexes_vec, small_lwe_vector_vec,
|
||||
lwe_trivial_indexes_vec, bsks, luts_message_carry->buffer,
|
||||
glwe_dimension, small_lwe_dimension, polynomial_size,
|
||||
mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level,
|
||||
mem_ptr->params.grouping_factor, total_count,
|
||||
mem_ptr->params.pbs_type);
|
||||
|
||||
multi_gpu_gather_lwe_async<Torus>(
|
||||
streams, gpu_indexes, active_gpu_count, new_blocks, lwe_after_pbs_vec,
|
||||
luts_message_carry->h_lwe_indexes_out,
|
||||
luts_message_carry->using_trivial_lwe_indexes, total_count,
|
||||
big_lwe_size);
|
||||
/// Synchronize all GPUs
|
||||
for (uint i = 0; i < active_gpu_count; i++) {
|
||||
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
|
||||
}
|
||||
}
|
||||
|
||||
int rem_blocks = (r > chunk_size) ? r % chunk_size * num_blocks : 0;
|
||||
int new_blocks_created = 2 * ch_amount * num_blocks;
|
||||
@@ -362,17 +419,15 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
|
||||
std::swap(new_blocks, old_blocks);
|
||||
r = (new_blocks_created + rem_blocks) / num_blocks;
|
||||
}
|
||||
luts_message_carry->release(streams, gpu_indexes, gpu_count);
|
||||
delete (luts_message_carry);
|
||||
|
||||
host_addition(streams[0], gpu_indexes[0], radix_lwe_out, old_blocks,
|
||||
&old_blocks[num_blocks * big_lwe_size], big_lwe_dimension,
|
||||
num_blocks);
|
||||
|
||||
host_propagate_single_carry<Torus>(streams, gpu_indexes, gpu_count,
|
||||
radix_lwe_out, nullptr, nullptr,
|
||||
mem_ptr->scp_mem, bsks, ksks, num_blocks);
|
||||
}
|
||||
|
||||
template <typename Torus, typename STorus, class params>
|
||||
template <typename Torus, class params>
|
||||
__host__ void host_integer_mult_radix_kb(
|
||||
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
|
||||
uint64_t *radix_lwe_out, uint64_t *radix_lwe_left,
|
||||
@@ -464,8 +519,7 @@ __host__ void host_integer_mult_radix_kb(
|
||||
fill_radix_from_lsb_msb<Torus, params>
|
||||
<<<num_blocks * num_blocks, params::degree / params::opt, 0,
|
||||
streams[0]>>>(vector_result_sb, vector_result_lsb, vector_result_msb,
|
||||
glwe_dimension, lsb_vector_block_count,
|
||||
msb_vector_block_count, num_blocks);
|
||||
glwe_dimension, num_blocks);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
int terms_degree[2 * num_blocks * num_blocks];
|
||||
@@ -481,10 +535,15 @@ __host__ void host_integer_mult_radix_kb(
|
||||
terms_degree_msb[i] = (b_id > r_id) ? message_modulus - 2 : 0;
|
||||
}
|
||||
|
||||
host_integer_sum_ciphertexts_vec_kb<Torus, params>(
|
||||
host_integer_partial_sum_ciphertexts_vec_kb<Torus, params>(
|
||||
streams, gpu_indexes, gpu_count, radix_lwe_out, vector_result_sb,
|
||||
terms_degree, bsks, ksks, mem_ptr->sum_ciphertexts_mem, num_blocks,
|
||||
2 * num_blocks);
|
||||
2 * num_blocks, mem_ptr->luts_array);
|
||||
|
||||
auto scp_mem_ptr = mem_ptr->sum_ciphertexts_mem->scp_mem;
|
||||
host_propagate_single_carry<Torus>(streams, gpu_indexes, gpu_count,
|
||||
radix_lwe_out, nullptr, nullptr,
|
||||
scp_mem_ptr, bsks, ksks, num_blocks);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
@@ -492,22 +551,6 @@ __host__ void scratch_cuda_integer_mult_radix_ciphertext_kb(
|
||||
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
|
||||
int_mul_memory<Torus> **mem_ptr, uint32_t num_radix_blocks,
|
||||
int_radix_params params, bool allocate_gpu_memory) {
|
||||
size_t sm_size = (params.big_lwe_dimension + 1) * sizeof(Torus);
|
||||
if (sm_size < cuda_get_max_shared_memory(gpu_indexes[0])) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
tree_add_chunks<Torus, FULLSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, sm_size));
|
||||
cudaFuncSetCacheConfig(tree_add_chunks<Torus, FULLSM>,
|
||||
cudaFuncCachePreferShared);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
} else {
|
||||
check_cuda_error(
|
||||
cudaFuncSetAttribute(tree_add_chunks<Torus, NOSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
|
||||
cudaFuncSetCacheConfig(tree_add_chunks<Torus, NOSM>, cudaFuncCachePreferL1);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
*mem_ptr = new int_mul_memory<Torus>(streams, gpu_indexes, gpu_count, params,
|
||||
num_radix_blocks, allocate_gpu_memory);
|
||||
}
|
||||
|
||||
@@ -38,65 +38,13 @@ void cuda_integer_radix_overflowing_sub_kb_64(
|
||||
|
||||
auto mem = (int_overflowing_sub_memory<uint64_t> *)mem_ptr;
|
||||
|
||||
switch (mem->params.polynomial_size) {
|
||||
case 512:
|
||||
host_integer_overflowing_sub_kb<uint64_t, AmortizedDegree<512>>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
static_cast<uint64_t *>(radix_lwe_out),
|
||||
static_cast<uint64_t *>(radix_lwe_overflowed),
|
||||
static_cast<uint64_t *>(radix_lwe_left),
|
||||
static_cast<uint64_t *>(radix_lwe_right), bsks, (uint64_t **)(ksks),
|
||||
mem, num_blocks);
|
||||
break;
|
||||
case 1024:
|
||||
host_integer_overflowing_sub_kb<uint64_t, AmortizedDegree<1024>>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
static_cast<uint64_t *>(radix_lwe_out),
|
||||
static_cast<uint64_t *>(radix_lwe_overflowed),
|
||||
static_cast<uint64_t *>(radix_lwe_left),
|
||||
static_cast<uint64_t *>(radix_lwe_right), bsks, (uint64_t **)(ksks),
|
||||
mem, num_blocks);
|
||||
break;
|
||||
case 2048:
|
||||
host_integer_overflowing_sub_kb<uint64_t, AmortizedDegree<2048>>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
static_cast<uint64_t *>(radix_lwe_out),
|
||||
static_cast<uint64_t *>(radix_lwe_overflowed),
|
||||
static_cast<uint64_t *>(radix_lwe_left),
|
||||
static_cast<uint64_t *>(radix_lwe_right), bsks, (uint64_t **)(ksks),
|
||||
mem, num_blocks);
|
||||
break;
|
||||
case 4096:
|
||||
host_integer_overflowing_sub_kb<uint64_t, AmortizedDegree<4096>>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
static_cast<uint64_t *>(radix_lwe_out),
|
||||
static_cast<uint64_t *>(radix_lwe_overflowed),
|
||||
static_cast<uint64_t *>(radix_lwe_left),
|
||||
static_cast<uint64_t *>(radix_lwe_right), bsks, (uint64_t **)(ksks),
|
||||
mem, num_blocks);
|
||||
break;
|
||||
case 8192:
|
||||
host_integer_overflowing_sub_kb<uint64_t, AmortizedDegree<8192>>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
static_cast<uint64_t *>(radix_lwe_out),
|
||||
static_cast<uint64_t *>(radix_lwe_overflowed),
|
||||
static_cast<uint64_t *>(radix_lwe_left),
|
||||
static_cast<uint64_t *>(radix_lwe_right), bsks, (uint64_t **)(ksks),
|
||||
mem, num_blocks);
|
||||
break;
|
||||
case 16384:
|
||||
host_integer_overflowing_sub_kb<uint64_t, AmortizedDegree<16384>>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
static_cast<uint64_t *>(radix_lwe_out),
|
||||
static_cast<uint64_t *>(radix_lwe_overflowed),
|
||||
static_cast<uint64_t *>(radix_lwe_left),
|
||||
static_cast<uint64_t *>(radix_lwe_right), bsks, (uint64_t **)(ksks),
|
||||
mem, num_blocks);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (integer overflowing sub): unsupported polynomial size. "
|
||||
"Only N = 512, 1024, 2048, 4096, 8192, 16384 is supported")
|
||||
}
|
||||
host_integer_overflowing_sub_kb<uint64_t>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
static_cast<uint64_t *>(radix_lwe_out),
|
||||
static_cast<uint64_t *>(radix_lwe_overflowed),
|
||||
static_cast<uint64_t *>(radix_lwe_left),
|
||||
static_cast<uint64_t *>(radix_lwe_right), bsks, (uint64_t **)(ksks), mem,
|
||||
num_blocks);
|
||||
}
|
||||
|
||||
void cleanup_cuda_integer_radix_overflowing_sub(void **streams,
|
||||
|
||||
@@ -98,7 +98,7 @@ __host__ void scratch_cuda_integer_overflowing_sub_kb(
|
||||
streams, gpu_indexes, gpu_count, params, num_blocks, allocate_gpu_memory);
|
||||
}
|
||||
|
||||
template <typename Torus, class params>
|
||||
template <typename Torus>
|
||||
__host__ void host_integer_overflowing_sub_kb(
|
||||
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
|
||||
Torus *radix_lwe_out, Torus *radix_lwe_overflowed, Torus *radix_lwe_left,
|
||||
@@ -113,9 +113,9 @@ __host__ void host_integer_overflowing_sub_kb(
|
||||
radix_params.message_modulus, radix_params.carry_modulus,
|
||||
radix_params.message_modulus - 1);
|
||||
|
||||
host_propagate_single_sub_borrow<Torus>(
|
||||
streams, gpu_indexes, gpu_count, radix_lwe_overflowed, radix_lwe_out,
|
||||
mem_ptr->borrow_prop_mem, bsks, ksks, num_blocks);
|
||||
host_propagate_single_sub_borrow<Torus>(streams, gpu_indexes, gpu_count,
|
||||
radix_lwe_overflowed, radix_lwe_out,
|
||||
mem_ptr, bsks, ksks, num_blocks);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
#define CUDA_INTEGER_SCALAR_COMPARISON_OPS_CUH
|
||||
|
||||
#include "integer/comparison.cuh"
|
||||
#include <omp.h>
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void integer_radix_unsigned_scalar_difference_check_kb(
|
||||
@@ -87,53 +86,43 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
|
||||
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
|
||||
}
|
||||
|
||||
#pragma omp parallel sections
|
||||
{
|
||||
// Both sections may be executed in parallel
|
||||
#pragma omp section
|
||||
{
|
||||
//////////////
|
||||
// lsb
|
||||
Torus *lhs = diff_buffer->tmp_packed_left;
|
||||
Torus *rhs = diff_buffer->tmp_packed_right;
|
||||
//////////////
|
||||
// lsb
|
||||
Torus *lhs = diff_buffer->tmp_packed_left;
|
||||
Torus *rhs = diff_buffer->tmp_packed_right;
|
||||
|
||||
pack_blocks(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in,
|
||||
big_lwe_dimension, num_lsb_radix_blocks, message_modulus);
|
||||
pack_blocks(lsb_streams[0], gpu_indexes[0], rhs, scalar_blocks, 0,
|
||||
total_num_scalar_blocks, message_modulus);
|
||||
pack_blocks(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in,
|
||||
big_lwe_dimension, num_lsb_radix_blocks, message_modulus);
|
||||
pack_blocks(lsb_streams[0], gpu_indexes[0], rhs, scalar_blocks, 0,
|
||||
total_num_scalar_blocks, message_modulus);
|
||||
|
||||
// From this point we have half number of blocks
|
||||
num_lsb_radix_blocks /= 2;
|
||||
num_lsb_radix_blocks += (total_num_scalar_blocks % 2);
|
||||
// From this point we have half number of blocks
|
||||
num_lsb_radix_blocks /= 2;
|
||||
num_lsb_radix_blocks += (total_num_scalar_blocks % 2);
|
||||
|
||||
// comparisons will be assigned
|
||||
// - 0 if lhs < rhs
|
||||
// - 1 if lhs == rhs
|
||||
// - 2 if lhs > rhs
|
||||
// comparisons will be assigned
|
||||
// - 0 if lhs < rhs
|
||||
// - 1 if lhs == rhs
|
||||
// - 2 if lhs > rhs
|
||||
|
||||
auto comparisons = mem_ptr->tmp_block_comparisons;
|
||||
scalar_compare_radix_blocks_kb(lsb_streams, gpu_indexes, gpu_count,
|
||||
comparisons, lhs, rhs, mem_ptr, bsks,
|
||||
ksks, num_lsb_radix_blocks);
|
||||
auto comparisons = mem_ptr->tmp_block_comparisons;
|
||||
scalar_compare_radix_blocks_kb(lsb_streams, gpu_indexes, gpu_count,
|
||||
comparisons, lhs, rhs, mem_ptr, bsks, ksks,
|
||||
num_lsb_radix_blocks);
|
||||
|
||||
// Reduces a vec containing radix blocks that encrypts a sign
|
||||
// (inferior, equal, superior) to one single radix block containing the
|
||||
// final sign
|
||||
tree_sign_reduction(
|
||||
lsb_streams, gpu_indexes, gpu_count, lwe_array_lsb_out, comparisons,
|
||||
mem_ptr->diff_buffer->tree_buffer, mem_ptr->identity_lut_f, bsks,
|
||||
ksks, num_lsb_radix_blocks);
|
||||
}
|
||||
#pragma omp section
|
||||
{
|
||||
//////////////
|
||||
// msb
|
||||
host_compare_with_zero_equality(
|
||||
msb_streams, gpu_indexes, gpu_count, lwe_array_msb_out, msb,
|
||||
mem_ptr, bsks, ksks, num_msb_radix_blocks, mem_ptr->is_zero_lut);
|
||||
}
|
||||
}
|
||||
for (uint j = 0; j < gpu_count; j++) {
|
||||
// Reduces a vec containing radix blocks that encrypts a sign
|
||||
// (inferior, equal, superior) to one single radix block containing the
|
||||
// final sign
|
||||
tree_sign_reduction(lsb_streams, gpu_indexes, gpu_count, lwe_array_lsb_out,
|
||||
comparisons, mem_ptr->diff_buffer->tree_buffer,
|
||||
mem_ptr->identity_lut_f, bsks, ksks,
|
||||
num_lsb_radix_blocks);
|
||||
//////////////
|
||||
// msb
|
||||
host_compare_with_zero_equality(msb_streams, gpu_indexes, gpu_count,
|
||||
lwe_array_msb_out, msb, mem_ptr, bsks, ksks,
|
||||
num_msb_radix_blocks, mem_ptr->is_zero_lut);
|
||||
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
|
||||
cuda_synchronize_stream(lsb_streams[j], gpu_indexes[j]);
|
||||
cuda_synchronize_stream(msb_streams[j], gpu_indexes[j]);
|
||||
}
|
||||
@@ -205,7 +194,6 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
|
||||
std::function<Torus(Torus)> sign_handler_f, void **bsks, Torus **ksks,
|
||||
uint32_t total_num_radix_blocks, uint32_t total_num_scalar_blocks) {
|
||||
|
||||
cudaSetDevice(gpu_indexes[0]);
|
||||
auto params = mem_ptr->params;
|
||||
auto big_lwe_dimension = params.big_lwe_dimension;
|
||||
auto glwe_dimension = params.glwe_dimension;
|
||||
@@ -311,93 +299,83 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
|
||||
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
|
||||
}
|
||||
|
||||
#pragma omp parallel sections
|
||||
{
|
||||
// Both sections may be executed in parallel
|
||||
#pragma omp section
|
||||
{
|
||||
//////////////
|
||||
// lsb
|
||||
Torus *lhs = diff_buffer->tmp_packed_left;
|
||||
Torus *rhs = diff_buffer->tmp_packed_right;
|
||||
//////////////
|
||||
// lsb
|
||||
Torus *lhs = diff_buffer->tmp_packed_left;
|
||||
Torus *rhs = diff_buffer->tmp_packed_right;
|
||||
|
||||
pack_blocks(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in,
|
||||
big_lwe_dimension, num_lsb_radix_blocks, message_modulus);
|
||||
pack_blocks(lsb_streams[0], gpu_indexes[0], rhs, scalar_blocks, 0,
|
||||
total_num_scalar_blocks, message_modulus);
|
||||
pack_blocks(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in,
|
||||
big_lwe_dimension, num_lsb_radix_blocks, message_modulus);
|
||||
pack_blocks(lsb_streams[0], gpu_indexes[0], rhs, scalar_blocks, 0,
|
||||
total_num_scalar_blocks, message_modulus);
|
||||
|
||||
// From this point we have half number of blocks
|
||||
num_lsb_radix_blocks /= 2;
|
||||
num_lsb_radix_blocks += (total_num_scalar_blocks % 2);
|
||||
// From this point we have half number of blocks
|
||||
num_lsb_radix_blocks /= 2;
|
||||
num_lsb_radix_blocks += (total_num_scalar_blocks % 2);
|
||||
|
||||
// comparisons will be assigned
|
||||
// - 0 if lhs < rhs
|
||||
// - 1 if lhs == rhs
|
||||
// - 2 if lhs > rhs
|
||||
// comparisons will be assigned
|
||||
// - 0 if lhs < rhs
|
||||
// - 1 if lhs == rhs
|
||||
// - 2 if lhs > rhs
|
||||
|
||||
auto comparisons = mem_ptr->tmp_block_comparisons;
|
||||
scalar_compare_radix_blocks_kb(lsb_streams, gpu_indexes, gpu_count,
|
||||
comparisons, lhs, rhs, mem_ptr, bsks,
|
||||
ksks, num_lsb_radix_blocks);
|
||||
auto comparisons = mem_ptr->tmp_block_comparisons;
|
||||
scalar_compare_radix_blocks_kb(lsb_streams, gpu_indexes, gpu_count,
|
||||
comparisons, lhs, rhs, mem_ptr, bsks, ksks,
|
||||
num_lsb_radix_blocks);
|
||||
|
||||
// Reduces a vec containing radix blocks that encrypts a sign
|
||||
// (inferior, equal, superior) to one single radix block containing the
|
||||
// final sign
|
||||
tree_sign_reduction(
|
||||
lsb_streams, gpu_indexes, gpu_count, lwe_array_lsb_out, comparisons,
|
||||
mem_ptr->diff_buffer->tree_buffer, mem_ptr->identity_lut_f, bsks,
|
||||
ksks, num_lsb_radix_blocks);
|
||||
// Reduces a vec containing radix blocks that encrypts a sign
|
||||
// (inferior, equal, superior) to one single radix block containing the
|
||||
// final sign
|
||||
tree_sign_reduction(lsb_streams, gpu_indexes, gpu_count, lwe_array_lsb_out,
|
||||
comparisons, mem_ptr->diff_buffer->tree_buffer,
|
||||
mem_ptr->identity_lut_f, bsks, ksks,
|
||||
num_lsb_radix_blocks);
|
||||
//////////////
|
||||
// msb
|
||||
// We remove the last block (which is the sign)
|
||||
Torus *are_all_msb_zeros = lwe_array_msb_out;
|
||||
host_compare_with_zero_equality(msb_streams, gpu_indexes, gpu_count,
|
||||
are_all_msb_zeros, msb, mem_ptr, bsks, ksks,
|
||||
num_msb_radix_blocks, mem_ptr->is_zero_lut);
|
||||
|
||||
auto sign_bit_pos = (int)log2(message_modulus) - 1;
|
||||
|
||||
auto lut_f = [mem_ptr, sign_bit_pos](Torus sign_block,
|
||||
Torus msb_are_zeros) {
|
||||
bool sign_bit_is_set = (sign_block >> sign_bit_pos) == 1;
|
||||
CMP_ORDERING sign_block_ordering;
|
||||
if (sign_bit_is_set) {
|
||||
sign_block_ordering = CMP_ORDERING::IS_INFERIOR;
|
||||
} else if (sign_block != 0) {
|
||||
sign_block_ordering = CMP_ORDERING::IS_SUPERIOR;
|
||||
} else {
|
||||
sign_block_ordering = CMP_ORDERING::IS_EQUAL;
|
||||
}
|
||||
#pragma omp section
|
||||
{
|
||||
//////////////
|
||||
// msb
|
||||
// We remove the last block (which is the sign)
|
||||
Torus *are_all_msb_zeros = lwe_array_msb_out;
|
||||
host_compare_with_zero_equality(
|
||||
msb_streams, gpu_indexes, gpu_count, are_all_msb_zeros, msb,
|
||||
mem_ptr, bsks, ksks, num_msb_radix_blocks, mem_ptr->is_zero_lut);
|
||||
|
||||
auto sign_bit_pos = (int)log2(message_modulus) - 1;
|
||||
CMP_ORDERING msb_ordering;
|
||||
if (msb_are_zeros == 1)
|
||||
msb_ordering = CMP_ORDERING::IS_EQUAL;
|
||||
else
|
||||
msb_ordering = CMP_ORDERING::IS_SUPERIOR;
|
||||
|
||||
auto lut_f = [mem_ptr, sign_bit_pos](Torus sign_block,
|
||||
Torus msb_are_zeros) {
|
||||
bool sign_bit_is_set = (sign_block >> sign_bit_pos) == 1;
|
||||
CMP_ORDERING sign_block_ordering;
|
||||
if (sign_bit_is_set) {
|
||||
sign_block_ordering = CMP_ORDERING::IS_INFERIOR;
|
||||
} else if (sign_block != 0) {
|
||||
sign_block_ordering = CMP_ORDERING::IS_SUPERIOR;
|
||||
} else {
|
||||
sign_block_ordering = CMP_ORDERING::IS_EQUAL;
|
||||
}
|
||||
return mem_ptr->diff_buffer->tree_buffer->block_selector_f(
|
||||
sign_block_ordering, msb_ordering);
|
||||
};
|
||||
|
||||
CMP_ORDERING msb_ordering;
|
||||
if (msb_are_zeros == 1)
|
||||
msb_ordering = CMP_ORDERING::IS_EQUAL;
|
||||
else
|
||||
msb_ordering = CMP_ORDERING::IS_SUPERIOR;
|
||||
auto signed_msb_lut = mem_ptr->signed_msb_lut;
|
||||
generate_device_accumulator_bivariate<Torus>(
|
||||
msb_streams[0], gpu_indexes[0],
|
||||
signed_msb_lut->get_lut(gpu_indexes[0], 0), params.glwe_dimension,
|
||||
params.polynomial_size, params.message_modulus, params.carry_modulus,
|
||||
lut_f);
|
||||
signed_msb_lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
|
||||
|
||||
return mem_ptr->diff_buffer->tree_buffer->block_selector_f(
|
||||
sign_block_ordering, msb_ordering);
|
||||
};
|
||||
|
||||
auto signed_msb_lut = mem_ptr->signed_msb_lut;
|
||||
generate_device_accumulator_bivariate<Torus>(
|
||||
msb_streams[0], gpu_indexes[0],
|
||||
signed_msb_lut->get_lut(gpu_indexes[0], 0), params.glwe_dimension,
|
||||
params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, lut_f);
|
||||
signed_msb_lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
|
||||
|
||||
Torus *sign_block = msb + (num_msb_radix_blocks - 1) * big_lwe_size;
|
||||
integer_radix_apply_bivariate_lookup_table_kb(
|
||||
msb_streams, gpu_indexes, gpu_count, lwe_array_msb_out, sign_block,
|
||||
are_all_msb_zeros, bsks, ksks, 1, signed_msb_lut,
|
||||
signed_msb_lut->params.message_modulus);
|
||||
}
|
||||
}
|
||||
for (uint j = 0; j < gpu_count; j++) {
|
||||
Torus *sign_block = msb + (num_msb_radix_blocks - 1) * big_lwe_size;
|
||||
integer_radix_apply_bivariate_lookup_table_kb(
|
||||
msb_streams, gpu_indexes, gpu_count, lwe_array_msb_out, sign_block,
|
||||
are_all_msb_zeros, bsks, ksks, 1, signed_msb_lut,
|
||||
signed_msb_lut->params.message_modulus);
|
||||
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
|
||||
cuda_synchronize_stream(lsb_streams[j], gpu_indexes[j]);
|
||||
cuda_synchronize_stream(msb_streams[j], gpu_indexes[j]);
|
||||
}
|
||||
@@ -422,50 +400,38 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
|
||||
auto lwe_array_ct_out = mem_ptr->tmp_lwe_array_out;
|
||||
auto lwe_array_sign_out =
|
||||
lwe_array_ct_out + (num_lsb_radix_blocks / 2) * big_lwe_size;
|
||||
#pragma omp parallel sections
|
||||
{
|
||||
// Both sections may be executed in parallel
|
||||
#pragma omp section
|
||||
{
|
||||
Torus *lhs = diff_buffer->tmp_packed_left;
|
||||
Torus *rhs = diff_buffer->tmp_packed_right;
|
||||
Torus *lhs = diff_buffer->tmp_packed_left;
|
||||
Torus *rhs = diff_buffer->tmp_packed_right;
|
||||
|
||||
pack_blocks(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in,
|
||||
big_lwe_dimension, num_lsb_radix_blocks - 1,
|
||||
message_modulus);
|
||||
pack_blocks(lsb_streams[0], gpu_indexes[0], rhs, scalar_blocks, 0,
|
||||
num_lsb_radix_blocks - 1, message_modulus);
|
||||
pack_blocks(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in,
|
||||
big_lwe_dimension, num_lsb_radix_blocks - 1, message_modulus);
|
||||
pack_blocks(lsb_streams[0], gpu_indexes[0], rhs, scalar_blocks, 0,
|
||||
num_lsb_radix_blocks - 1, message_modulus);
|
||||
|
||||
// From this point we have half number of blocks
|
||||
num_lsb_radix_blocks /= 2;
|
||||
// From this point we have half number of blocks
|
||||
num_lsb_radix_blocks /= 2;
|
||||
|
||||
// comparisons will be assigned
|
||||
// - 0 if lhs < rhs
|
||||
// - 1 if lhs == rhs
|
||||
// - 2 if lhs > rhs
|
||||
scalar_compare_radix_blocks_kb(lsb_streams, gpu_indexes, gpu_count,
|
||||
lwe_array_ct_out, lhs, rhs, mem_ptr,
|
||||
bsks, ksks, num_lsb_radix_blocks);
|
||||
}
|
||||
#pragma omp section
|
||||
{
|
||||
Torus *encrypted_sign_block =
|
||||
lwe_array_in + (total_num_radix_blocks - 1) * big_lwe_size;
|
||||
Torus *scalar_sign_block =
|
||||
scalar_blocks + (total_num_scalar_blocks - 1);
|
||||
// comparisons will be assigned
|
||||
// - 0 if lhs < rhs
|
||||
// - 1 if lhs == rhs
|
||||
// - 2 if lhs > rhs
|
||||
scalar_compare_radix_blocks_kb(lsb_streams, gpu_indexes, gpu_count,
|
||||
lwe_array_ct_out, lhs, rhs, mem_ptr, bsks,
|
||||
ksks, num_lsb_radix_blocks);
|
||||
Torus *encrypted_sign_block =
|
||||
lwe_array_in + (total_num_radix_blocks - 1) * big_lwe_size;
|
||||
Torus *scalar_sign_block = scalar_blocks + (total_num_scalar_blocks - 1);
|
||||
|
||||
auto trivial_sign_block = mem_ptr->tmp_trivial_sign_block;
|
||||
create_trivial_radix(msb_streams[0], gpu_indexes[0], trivial_sign_block,
|
||||
scalar_sign_block, big_lwe_dimension, 1, 1,
|
||||
message_modulus, carry_modulus);
|
||||
auto trivial_sign_block = mem_ptr->tmp_trivial_sign_block;
|
||||
create_trivial_radix(msb_streams[0], gpu_indexes[0], trivial_sign_block,
|
||||
scalar_sign_block, big_lwe_dimension, 1, 1,
|
||||
message_modulus, carry_modulus);
|
||||
|
||||
integer_radix_apply_bivariate_lookup_table_kb(
|
||||
msb_streams, gpu_indexes, gpu_count, lwe_array_sign_out,
|
||||
encrypted_sign_block, trivial_sign_block, bsks, ksks, 1,
|
||||
mem_ptr->signed_lut, mem_ptr->signed_lut->params.message_modulus);
|
||||
}
|
||||
}
|
||||
for (uint j = 0; j < gpu_count; j++) {
|
||||
integer_radix_apply_bivariate_lookup_table_kb(
|
||||
msb_streams, gpu_indexes, gpu_count, lwe_array_sign_out,
|
||||
encrypted_sign_block, trivial_sign_block, bsks, ksks, 1,
|
||||
mem_ptr->signed_lut, mem_ptr->signed_lut->params.message_modulus);
|
||||
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
|
||||
cuda_synchronize_stream(lsb_streams[j], gpu_indexes[j]);
|
||||
cuda_synchronize_stream(msb_streams[j], gpu_indexes[j]);
|
||||
}
|
||||
@@ -566,6 +532,8 @@ __host__ void scalar_compare_radix_blocks_kb(
|
||||
int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
|
||||
uint32_t num_radix_blocks) {
|
||||
|
||||
if (num_radix_blocks == 0)
|
||||
return;
|
||||
auto params = mem_ptr->params;
|
||||
auto big_lwe_dimension = params.big_lwe_dimension;
|
||||
auto message_modulus = params.message_modulus;
|
||||
@@ -686,58 +654,47 @@ __host__ void host_integer_radix_scalar_equality_check_kb(
|
||||
auto lsb_streams = mem_ptr->lsb_streams;
|
||||
auto msb_streams = mem_ptr->msb_streams;
|
||||
|
||||
#pragma omp parallel sections
|
||||
{
|
||||
// Both sections may be executed in parallel
|
||||
#pragma omp section
|
||||
{
|
||||
if (num_halved_scalar_blocks > 0) {
|
||||
auto packed_blocks = mem_ptr->tmp_packed_input;
|
||||
auto packed_scalar =
|
||||
packed_blocks + big_lwe_size * num_halved_lsb_radix_blocks;
|
||||
if (num_halved_scalar_blocks > 0) {
|
||||
auto packed_blocks = mem_ptr->tmp_packed_input;
|
||||
auto packed_scalar =
|
||||
packed_blocks + big_lwe_size * num_halved_lsb_radix_blocks;
|
||||
|
||||
pack_blocks(lsb_streams[0], gpu_indexes[0], packed_blocks, lsb,
|
||||
big_lwe_dimension, num_lsb_radix_blocks, message_modulus);
|
||||
pack_blocks(lsb_streams[0], gpu_indexes[0], packed_scalar,
|
||||
scalar_blocks, 0, num_scalar_blocks, message_modulus);
|
||||
pack_blocks(lsb_streams[0], gpu_indexes[0], packed_blocks, lsb,
|
||||
big_lwe_dimension, num_lsb_radix_blocks, message_modulus);
|
||||
pack_blocks(lsb_streams[0], gpu_indexes[0], packed_scalar, scalar_blocks, 0,
|
||||
num_scalar_blocks, message_modulus);
|
||||
|
||||
cuda_memcpy_async_gpu_to_gpu(
|
||||
scalar_comparison_luts->get_lut_indexes(gpu_indexes[0], 0),
|
||||
packed_scalar, num_halved_scalar_blocks * sizeof(Torus),
|
||||
lsb_streams[0], gpu_indexes[0]);
|
||||
scalar_comparison_luts->broadcast_lut(lsb_streams, gpu_indexes, 0);
|
||||
cuda_memcpy_async_gpu_to_gpu(
|
||||
scalar_comparison_luts->get_lut_indexes(gpu_indexes[0], 0),
|
||||
packed_scalar, num_halved_scalar_blocks * sizeof(Torus), lsb_streams[0],
|
||||
gpu_indexes[0]);
|
||||
scalar_comparison_luts->broadcast_lut(lsb_streams, gpu_indexes, 0);
|
||||
|
||||
integer_radix_apply_univariate_lookup_table_kb(
|
||||
lsb_streams, gpu_indexes, gpu_count, lwe_array_lsb_out,
|
||||
packed_blocks, bsks, ksks, num_halved_lsb_radix_blocks,
|
||||
scalar_comparison_luts);
|
||||
}
|
||||
integer_radix_apply_univariate_lookup_table_kb(
|
||||
lsb_streams, gpu_indexes, gpu_count, lwe_array_lsb_out, packed_blocks,
|
||||
bsks, ksks, num_halved_lsb_radix_blocks, scalar_comparison_luts);
|
||||
}
|
||||
//////////////
|
||||
// msb
|
||||
if (num_msb_radix_blocks > 0) {
|
||||
int_radix_lut<Torus> *msb_lut;
|
||||
switch (mem_ptr->op) {
|
||||
case COMPARISON_TYPE::EQ:
|
||||
msb_lut = mem_ptr->is_zero_lut;
|
||||
break;
|
||||
case COMPARISON_TYPE::NE:
|
||||
msb_lut = mem_ptr->eq_buffer->is_non_zero_lut;
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error: integer operation not supported")
|
||||
}
|
||||
#pragma omp section
|
||||
{
|
||||
//////////////
|
||||
// msb
|
||||
if (num_msb_radix_blocks > 0) {
|
||||
int_radix_lut<Torus> *msb_lut;
|
||||
switch (mem_ptr->op) {
|
||||
case COMPARISON_TYPE::EQ:
|
||||
msb_lut = mem_ptr->is_zero_lut;
|
||||
break;
|
||||
case COMPARISON_TYPE::NE:
|
||||
msb_lut = mem_ptr->eq_buffer->is_non_zero_lut;
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error: integer operation not supported")
|
||||
}
|
||||
|
||||
host_compare_with_zero_equality(msb_streams, gpu_indexes, gpu_count,
|
||||
lwe_array_msb_out, msb, mem_ptr, bsks,
|
||||
ksks, num_msb_radix_blocks, msb_lut);
|
||||
}
|
||||
}
|
||||
host_compare_with_zero_equality(msb_streams, gpu_indexes, gpu_count,
|
||||
lwe_array_msb_out, msb, mem_ptr, bsks, ksks,
|
||||
num_msb_radix_blocks, msb_lut);
|
||||
}
|
||||
|
||||
for (uint j = 0; j < gpu_count; j++) {
|
||||
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
|
||||
cuda_synchronize_stream(lsb_streams[j], gpu_indexes[j]);
|
||||
cuda_synchronize_stream(msb_streams[j], gpu_indexes[j]);
|
||||
}
|
||||
|
||||
@@ -33,22 +33,6 @@ __host__ void scratch_cuda_integer_radix_scalar_mul_kb(
|
||||
int_scalar_mul_buffer<T> **mem_ptr, uint32_t num_radix_blocks,
|
||||
int_radix_params params, bool allocate_gpu_memory) {
|
||||
|
||||
size_t sm_size = (params.big_lwe_dimension + 1) * sizeof(T);
|
||||
if (sm_size < cuda_get_max_shared_memory(gpu_indexes[0])) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
tree_add_chunks<T, FULLSM>, cudaFuncAttributeMaxDynamicSharedMemorySize,
|
||||
sm_size));
|
||||
cudaFuncSetCacheConfig(tree_add_chunks<T, FULLSM>,
|
||||
cudaFuncCachePreferShared);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
} else {
|
||||
check_cuda_error(
|
||||
cudaFuncSetAttribute(tree_add_chunks<T, NOSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
|
||||
cudaFuncSetCacheConfig(tree_add_chunks<T, NOSM>, cudaFuncCachePreferL1);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
*mem_ptr =
|
||||
new int_scalar_mul_buffer<T>(streams, gpu_indexes, gpu_count, params,
|
||||
num_radix_blocks, allocate_gpu_memory);
|
||||
@@ -108,6 +92,10 @@ __host__ void host_integer_scalar_mul_radix(
|
||||
}
|
||||
}
|
||||
|
||||
cuda_drop_async(preshifted_buffer, streams[0], gpu_indexes[0]);
|
||||
mem->logical_scalar_shift_buffer->release(streams, gpu_indexes, gpu_count);
|
||||
delete (mem->logical_scalar_shift_buffer);
|
||||
|
||||
if (j == 0) {
|
||||
// lwe array = 0
|
||||
cuda_memset_async(lwe_array, 0, num_radix_blocks * lwe_size_bytes,
|
||||
@@ -117,10 +105,15 @@ __host__ void host_integer_scalar_mul_radix(
|
||||
for (int i = 0; i < j * num_radix_blocks; i++) {
|
||||
terms_degree[i] = message_modulus - 1;
|
||||
}
|
||||
host_integer_sum_ciphertexts_vec_kb<T, params>(
|
||||
host_integer_partial_sum_ciphertexts_vec_kb<T, params>(
|
||||
streams, gpu_indexes, gpu_count, lwe_array, all_shifted_buffer,
|
||||
terms_degree, bsks, ksks, mem->sum_ciphertexts_vec_mem,
|
||||
num_radix_blocks, j);
|
||||
|
||||
auto scp_mem_ptr = mem->sum_ciphertexts_vec_mem->scp_mem;
|
||||
host_propagate_single_carry<T>(streams, gpu_indexes, gpu_count, lwe_array,
|
||||
nullptr, nullptr, scp_mem_ptr, bsks, ksks,
|
||||
num_radix_blocks);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -49,8 +49,6 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(
|
||||
|
||||
Torus *rotated_buffer = mem->tmp_rotated;
|
||||
|
||||
auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];
|
||||
|
||||
// rotate right all the blocks in radix ciphertext
|
||||
// copy result in new buffer
|
||||
// 256 threads are used in every block
|
||||
@@ -76,6 +74,8 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(
|
||||
giver_blocks, lwe_array, 1, num_blocks,
|
||||
big_lwe_size);
|
||||
|
||||
auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];
|
||||
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, lwe_array, receiver_blocks,
|
||||
giver_blocks, bsks, ksks, num_blocks, lut_bivariate,
|
||||
@@ -100,6 +100,8 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(
|
||||
host_radix_blocks_rotate_left(streams, gpu_indexes, gpu_count, giver_blocks,
|
||||
lwe_array, 1, num_blocks, big_lwe_size);
|
||||
|
||||
auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];
|
||||
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, lwe_array, receiver_blocks,
|
||||
giver_blocks, bsks, ksks, num_blocks, lut_bivariate,
|
||||
|
||||
@@ -10,7 +10,6 @@
|
||||
#include "types/complex/operations.cuh"
|
||||
#include "utils/helper.cuh"
|
||||
#include "utils/kernel_dimensions.cuh"
|
||||
#include <omp.h>
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void scratch_cuda_integer_radix_logical_scalar_shift_kb(
|
||||
@@ -52,13 +51,6 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(
|
||||
Torus *full_rotated_buffer = mem->tmp_rotated;
|
||||
Torus *rotated_buffer = &full_rotated_buffer[big_lwe_size];
|
||||
|
||||
auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];
|
||||
|
||||
// rotate right all the blocks in radix ciphertext
|
||||
// copy result in new buffer
|
||||
// 1024 threads are used in every block
|
||||
// block_count blocks will be used in the grid
|
||||
// one block is responsible to process single lwe ciphertext
|
||||
if (mem->shift_type == LEFT_SHIFT) {
|
||||
// rotate right as the blocks are from LSB to MSB
|
||||
host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count,
|
||||
@@ -76,6 +68,7 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(
|
||||
return;
|
||||
}
|
||||
|
||||
auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];
|
||||
auto partial_current_blocks = &lwe_array[rotations * big_lwe_size];
|
||||
auto partial_previous_blocks =
|
||||
&full_rotated_buffer[rotations * big_lwe_size];
|
||||
@@ -109,6 +102,7 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(
|
||||
|
||||
auto partial_current_blocks = lwe_array;
|
||||
auto partial_next_blocks = &rotated_buffer[big_lwe_size];
|
||||
auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];
|
||||
|
||||
size_t partial_block_count = num_blocks - rotations;
|
||||
|
||||
@@ -139,8 +133,6 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
|
||||
int_arithmetic_scalar_shift_buffer<Torus> *mem, void **bsks, Torus **ksks,
|
||||
uint32_t num_blocks) {
|
||||
|
||||
cudaSetDevice(gpu_indexes[0]);
|
||||
|
||||
auto params = mem->params;
|
||||
auto glwe_dimension = params.glwe_dimension;
|
||||
auto polynomial_size = params.polynomial_size;
|
||||
@@ -160,15 +152,9 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
|
||||
size_t shift_within_block = shift % num_bits_in_block;
|
||||
|
||||
Torus *rotated_buffer = mem->tmp_rotated;
|
||||
Torus *padding_block = &rotated_buffer[num_blocks * big_lwe_size];
|
||||
Torus *padding_block = &rotated_buffer[(num_blocks + 1) * big_lwe_size];
|
||||
Torus *last_block_copy = &padding_block[big_lwe_size];
|
||||
|
||||
auto lut_univariate_shift_last_block =
|
||||
mem->lut_buffers_univariate[shift_within_block - 1];
|
||||
auto lut_univariate_padding_block =
|
||||
mem->lut_buffers_univariate[num_bits_in_block - 1];
|
||||
auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];
|
||||
|
||||
if (mem->shift_type == RIGHT_SHIFT) {
|
||||
host_radix_blocks_rotate_left(streams, gpu_indexes, gpu_count,
|
||||
rotated_buffer, lwe_array, rotations,
|
||||
@@ -197,59 +183,59 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
|
||||
return;
|
||||
}
|
||||
|
||||
// In the arithmetic shift case we have to pad with the value of the sign
|
||||
// bit. This creates the need for a different shifting lut than in the
|
||||
// logical shift case. We also need another PBS to create the padding block.
|
||||
Torus *last_block = lwe_array + (num_blocks - rotations - 1) * big_lwe_size;
|
||||
cuda_memcpy_async_gpu_to_gpu(
|
||||
last_block_copy,
|
||||
rotated_buffer + (num_blocks - rotations - 1) * big_lwe_size,
|
||||
big_lwe_size_bytes, streams[0], gpu_indexes[0]);
|
||||
auto partial_current_blocks = lwe_array;
|
||||
auto partial_next_blocks = &rotated_buffer[big_lwe_size];
|
||||
size_t partial_block_count = num_blocks - rotations;
|
||||
if (shift_within_block != 0 && rotations != num_blocks) {
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, partial_current_blocks,
|
||||
partial_current_blocks, partial_next_blocks, bsks, ksks,
|
||||
partial_block_count, lut_bivariate,
|
||||
lut_bivariate->params.message_modulus);
|
||||
}
|
||||
// Since our CPU threads will be working on different streams we shall
|
||||
// assert the work in the main stream is completed
|
||||
for (uint j = 0; j < gpu_count; j++) {
|
||||
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
|
||||
}
|
||||
#pragma omp parallel sections
|
||||
{
|
||||
// All sections may be executed in parallel
|
||||
#pragma omp section
|
||||
{
|
||||
integer_radix_apply_univariate_lookup_table_kb(
|
||||
mem->local_streams_1, gpu_indexes, gpu_count, padding_block,
|
||||
last_block_copy, bsks, ksks, 1, lut_univariate_padding_block);
|
||||
// Replace blocks 'pulled' from the left with the correct padding block
|
||||
for (uint i = 0; i < rotations; i++) {
|
||||
cuda_memcpy_async_gpu_to_gpu(
|
||||
lwe_array + (num_blocks - rotations + i) * big_lwe_size,
|
||||
padding_block, big_lwe_size_bytes, mem->local_streams_1[0],
|
||||
gpu_indexes[0]);
|
||||
}
|
||||
}
|
||||
#pragma omp section
|
||||
{
|
||||
if (shift_within_block != 0 && rotations != num_blocks) {
|
||||
integer_radix_apply_univariate_lookup_table_kb(
|
||||
mem->local_streams_2, gpu_indexes, gpu_count, last_block,
|
||||
last_block_copy, bsks, ksks, 1, lut_univariate_shift_last_block);
|
||||
}
|
||||
}
|
||||
}
|
||||
for (uint j = 0; j < gpu_count; j++) {
|
||||
cuda_synchronize_stream(mem->local_streams_1[j], gpu_indexes[j]);
|
||||
cuda_synchronize_stream(mem->local_streams_2[j], gpu_indexes[j]);
|
||||
}
|
||||
if (num_blocks != rotations) {
|
||||
// In the arithmetic shift case we have to pad with the value of the sign
|
||||
// bit. This creates the need for a different shifting lut than in the
|
||||
// logical shift case. We also need another PBS to create the padding
|
||||
// block.
|
||||
Torus *last_block =
|
||||
lwe_array + (num_blocks - rotations - 1) * big_lwe_size;
|
||||
cuda_memcpy_async_gpu_to_gpu(
|
||||
last_block_copy,
|
||||
rotated_buffer + (num_blocks - rotations - 1) * big_lwe_size,
|
||||
big_lwe_size_bytes, streams[0], gpu_indexes[0]);
|
||||
if (shift_within_block != 0) {
|
||||
auto partial_current_blocks = lwe_array;
|
||||
auto partial_next_blocks = &rotated_buffer[big_lwe_size];
|
||||
size_t partial_block_count = num_blocks - rotations;
|
||||
auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];
|
||||
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, partial_current_blocks,
|
||||
partial_current_blocks, partial_next_blocks, bsks, ksks,
|
||||
partial_block_count, lut_bivariate,
|
||||
lut_bivariate->params.message_modulus);
|
||||
}
|
||||
// Since our CPU threads will be working on different streams we shall
|
||||
// assert the work in the main stream is completed
|
||||
for (uint j = 0; j < gpu_count; j++) {
|
||||
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
|
||||
}
|
||||
auto lut_univariate_padding_block =
|
||||
mem->lut_buffers_univariate[num_bits_in_block - 1];
|
||||
integer_radix_apply_univariate_lookup_table_kb(
|
||||
mem->local_streams_1, gpu_indexes, gpu_count, padding_block,
|
||||
last_block_copy, bsks, ksks, 1, lut_univariate_padding_block);
|
||||
// Replace blocks 'pulled' from the left with the correct padding
|
||||
// block
|
||||
for (uint i = 0; i < rotations; i++) {
|
||||
cuda_memcpy_async_gpu_to_gpu(lwe_array + (num_blocks - rotations + i) *
|
||||
big_lwe_size,
|
||||
padding_block, big_lwe_size_bytes,
|
||||
mem->local_streams_1[0], gpu_indexes[0]);
|
||||
}
|
||||
if (shift_within_block != 0) {
|
||||
auto lut_univariate_shift_last_block =
|
||||
mem->lut_buffers_univariate[shift_within_block - 1];
|
||||
integer_radix_apply_univariate_lookup_table_kb(
|
||||
mem->local_streams_2, gpu_indexes, gpu_count, last_block,
|
||||
last_block_copy, bsks, ksks, 1, lut_univariate_shift_last_block);
|
||||
}
|
||||
for (uint j = 0; j < mem->active_gpu_count; j++) {
|
||||
cuda_synchronize_stream(mem->local_streams_1[j], gpu_indexes[j]);
|
||||
cuda_synchronize_stream(mem->local_streams_2[j], gpu_indexes[j]);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
PANIC("Cuda error (scalar shift): left scalar shift is never of the "
|
||||
"arithmetic type")
|
||||
|
||||
@@ -37,8 +37,6 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
|
||||
auto big_lwe_size = big_lwe_dimension + 1;
|
||||
auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
|
||||
|
||||
cudaSetDevice(gpu_indexes[0]);
|
||||
|
||||
// Extract all bits
|
||||
auto bits = mem->tmp_bits;
|
||||
extract_n_bits<Torus>(streams, gpu_indexes, gpu_count, bits, lwe_array, bsks,
|
||||
|
||||
@@ -36,6 +36,18 @@ void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
|
||||
}
|
||||
|
||||
// We need these lines so the compiler knows how to specialize these functions
|
||||
template __device__ const uint64_t *
|
||||
get_ith_mask_kth_block(const uint64_t *ptr, int i, int k, int level,
|
||||
uint32_t polynomial_size, int glwe_dimension,
|
||||
uint32_t level_count);
|
||||
template __device__ const uint32_t *
|
||||
get_ith_mask_kth_block(const uint32_t *ptr, int i, int k, int level,
|
||||
uint32_t polynomial_size, int glwe_dimension,
|
||||
uint32_t level_count);
|
||||
template __device__ const double2 *
|
||||
get_ith_mask_kth_block(const double2 *ptr, int i, int k, int level,
|
||||
uint32_t polynomial_size, int glwe_dimension,
|
||||
uint32_t level_count);
|
||||
template __device__ uint64_t *get_ith_mask_kth_block(uint64_t *ptr, int i,
|
||||
int k, int level,
|
||||
uint32_t polynomial_size,
|
||||
@@ -51,6 +63,7 @@ template __device__ double2 *get_ith_mask_kth_block(double2 *ptr, int i, int k,
|
||||
uint32_t polynomial_size,
|
||||
int glwe_dimension,
|
||||
uint32_t level_count);
|
||||
|
||||
template __device__ uint64_t *get_ith_body_kth_block(uint64_t *ptr, int i,
|
||||
int k, int level,
|
||||
uint32_t polynomial_size,
|
||||
@@ -67,10 +80,12 @@ template __device__ double2 *get_ith_body_kth_block(double2 *ptr, int i, int k,
|
||||
int glwe_dimension,
|
||||
uint32_t level_count);
|
||||
|
||||
template __device__ uint64_t *get_multi_bit_ith_lwe_gth_group_kth_block(
|
||||
uint64_t *ptr, int g, int i, int k, int level, uint32_t grouping_factor,
|
||||
uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t level_count);
|
||||
template __device__ const uint64_t *get_multi_bit_ith_lwe_gth_group_kth_block(
|
||||
const uint64_t *ptr, int g, int i, int k, int level,
|
||||
uint32_t grouping_factor, uint32_t polynomial_size, uint32_t glwe_dimension,
|
||||
uint32_t level_count);
|
||||
|
||||
template __device__ double2 *get_multi_bit_ith_lwe_gth_group_kth_block(
|
||||
double2 *ptr, int g, int i, int k, int level, uint32_t grouping_factor,
|
||||
uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t level_count);
|
||||
template __device__ const double2 *get_multi_bit_ith_lwe_gth_group_kth_block(
|
||||
const double2 *ptr, int g, int i, int k, int level,
|
||||
uint32_t grouping_factor, uint32_t polynomial_size, uint32_t glwe_dimension,
|
||||
uint32_t level_count);
|
||||
|
||||
@@ -17,6 +17,18 @@ __device__ inline int get_start_ith_ggsw(int i, uint32_t polynomial_size,
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////
|
||||
template <typename T>
|
||||
__device__ const T *get_ith_mask_kth_block(const T *ptr, int i, int k,
|
||||
int level, uint32_t polynomial_size,
|
||||
int glwe_dimension,
|
||||
uint32_t level_count) {
|
||||
return &ptr[get_start_ith_ggsw(i, polynomial_size, glwe_dimension,
|
||||
level_count) +
|
||||
level * polynomial_size / 2 * (glwe_dimension + 1) *
|
||||
(glwe_dimension + 1) +
|
||||
k * polynomial_size / 2 * (glwe_dimension + 1)];
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ T *get_ith_mask_kth_block(T *ptr, int i, int k, int level,
|
||||
uint32_t polynomial_size,
|
||||
@@ -27,7 +39,6 @@ __device__ T *get_ith_mask_kth_block(T *ptr, int i, int k, int level,
|
||||
(glwe_dimension + 1) +
|
||||
k * polynomial_size / 2 * (glwe_dimension + 1)];
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ T *get_ith_body_kth_block(T *ptr, int i, int k, int level,
|
||||
uint32_t polynomial_size,
|
||||
@@ -50,14 +61,16 @@ __device__ inline int get_start_ith_lwe(uint32_t i, uint32_t grouping_factor,
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ T *get_multi_bit_ith_lwe_gth_group_kth_block(
|
||||
T *ptr, int g, int i, int k, int level, uint32_t grouping_factor,
|
||||
__device__ const T *get_multi_bit_ith_lwe_gth_group_kth_block(
|
||||
const T *ptr, int g, int i, int k, int level, uint32_t grouping_factor,
|
||||
uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t level_count) {
|
||||
T *ptr_group = ptr + get_start_ith_lwe(i, grouping_factor, polynomial_size,
|
||||
glwe_dimension, level_count);
|
||||
const T *ptr_group =
|
||||
ptr + get_start_ith_lwe(i, grouping_factor, polynomial_size,
|
||||
glwe_dimension, level_count);
|
||||
return get_ith_mask_kth_block(ptr_group, g, k, level, polynomial_size,
|
||||
glwe_dimension, level_count);
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////
|
||||
template <typename T, typename ST>
|
||||
void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
|
||||
@@ -77,7 +90,8 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
|
||||
int gridSize = total_polynomials;
|
||||
int blockSize = polynomial_size / choose_opt_amortized(polynomial_size);
|
||||
|
||||
double2 *h_bsk = (double2 *)malloc(buffer_size);
|
||||
double2 *h_bsk;
|
||||
cudaMallocHost((void **)&h_bsk, buffer_size);
|
||||
|
||||
double2 *d_bsk = (double2 *)cuda_malloc_async(buffer_size, stream, gpu_index);
|
||||
|
||||
@@ -101,7 +115,7 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
|
||||
double2 *buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
|
||||
if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
batch_NSMFFT<FFTDegree<AmortizedDegree<256>, ForwardFFT>, FULLSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
|
||||
@@ -119,7 +133,7 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
|
||||
}
|
||||
break;
|
||||
case 512:
|
||||
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
|
||||
if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
batch_NSMFFT<FFTDegree<AmortizedDegree<512>, ForwardFFT>, FULLSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
|
||||
@@ -137,7 +151,7 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
|
||||
}
|
||||
break;
|
||||
case 1024:
|
||||
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
|
||||
if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
batch_NSMFFT<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, FULLSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
|
||||
@@ -155,7 +169,7 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
|
||||
}
|
||||
break;
|
||||
case 2048:
|
||||
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
|
||||
if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
batch_NSMFFT<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, FULLSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
|
||||
@@ -173,7 +187,7 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
|
||||
}
|
||||
break;
|
||||
case 4096:
|
||||
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
|
||||
if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
batch_NSMFFT<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, FULLSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
|
||||
@@ -191,7 +205,7 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
|
||||
}
|
||||
break;
|
||||
case 8192:
|
||||
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
|
||||
if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
batch_NSMFFT<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, FULLSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
|
||||
@@ -209,7 +223,7 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
|
||||
}
|
||||
break;
|
||||
case 16384:
|
||||
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
|
||||
if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
batch_NSMFFT<FFTDegree<AmortizedDegree<16384>, ForwardFFT>, FULLSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
|
||||
@@ -233,7 +247,7 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
|
||||
|
||||
cuda_drop_async(d_bsk, stream, gpu_index);
|
||||
cuda_drop_async(buffer, stream, gpu_index);
|
||||
free(h_bsk);
|
||||
cudaFreeHost(h_bsk);
|
||||
}
|
||||
|
||||
void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
|
||||
@@ -254,7 +268,7 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
|
||||
double2 *buffer;
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
|
||||
if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
|
||||
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
batch_polynomial_mul<FFTDegree<AmortizedDegree<256>, ForwardFFT>,
|
||||
@@ -275,7 +289,7 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
|
||||
}
|
||||
break;
|
||||
case 512:
|
||||
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
|
||||
if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
|
||||
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
batch_polynomial_mul<FFTDegree<AmortizedDegree<521>, ForwardFFT>,
|
||||
@@ -296,7 +310,7 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
|
||||
}
|
||||
break;
|
||||
case 1024:
|
||||
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
|
||||
if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
|
||||
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
batch_polynomial_mul<FFTDegree<AmortizedDegree<1024>, ForwardFFT>,
|
||||
@@ -317,7 +331,7 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
|
||||
}
|
||||
break;
|
||||
case 2048:
|
||||
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
|
||||
if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
|
||||
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
batch_polynomial_mul<FFTDegree<AmortizedDegree<2048>, ForwardFFT>,
|
||||
@@ -338,7 +352,7 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
|
||||
}
|
||||
break;
|
||||
case 4096:
|
||||
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
|
||||
if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
|
||||
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
batch_polynomial_mul<FFTDegree<AmortizedDegree<4096>, ForwardFFT>,
|
||||
@@ -359,7 +373,7 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
|
||||
}
|
||||
break;
|
||||
case 8192:
|
||||
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
|
||||
if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
|
||||
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
batch_polynomial_mul<FFTDegree<AmortizedDegree<8192>, ForwardFFT>,
|
||||
@@ -380,7 +394,7 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
|
||||
}
|
||||
break;
|
||||
case 16384:
|
||||
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
|
||||
if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
|
||||
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
batch_polynomial_mul<FFTDegree<AmortizedDegree<16384>, ForwardFFT>,
|
||||
|
||||
@@ -1,14 +1,13 @@
|
||||
#ifndef CUDA_PROGRAMMABLE_BOOTSTRAP_CUH
|
||||
#define CUDA_PROGRAMMABLE_BOOTSTRAP_CUH
|
||||
|
||||
#include "cooperative_groups.h"
|
||||
#include "device.h"
|
||||
#include "fft/bnsmfft.cuh"
|
||||
#include "helper_multi_gpu.h"
|
||||
#include "programmable_bootstrap.h"
|
||||
#include "programmable_bootstrap_multibit.h"
|
||||
|
||||
#include "cooperative_groups.h"
|
||||
#include "helper_multi_gpu.h"
|
||||
|
||||
using namespace cooperative_groups;
|
||||
namespace cg = cooperative_groups;
|
||||
|
||||
@@ -22,11 +21,11 @@ get_join_buffer_element(int level_id, int glwe_id, G &group,
|
||||
uint32_t glwe_dimension, bool support_dsm);
|
||||
|
||||
template <typename Torus, typename G, class params>
|
||||
__device__ void mul_ggsw_glwe(Torus *accumulator, double2 *fft,
|
||||
double2 *join_buffer, double2 *bootstrapping_key,
|
||||
int polynomial_size, uint32_t glwe_dimension,
|
||||
int level_count, int iteration, G &group,
|
||||
bool support_dsm = false) {
|
||||
__device__ void
|
||||
mul_ggsw_glwe(Torus *accumulator, double2 *fft, double2 *join_buffer,
|
||||
const double2 *__restrict__ bootstrapping_key,
|
||||
int polynomial_size, uint32_t glwe_dimension, int level_count,
|
||||
int iteration, G &group, bool support_dsm = false) {
|
||||
|
||||
// Switch to the FFT space
|
||||
NSMFFT_direct<HalfDegree<params>>(fft);
|
||||
@@ -118,22 +117,17 @@ __device__ void mul_ggsw_glwe(Torus *accumulator, double2 *fft,
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
void execute_pbs(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
uint32_t gpu_count, Torus *lwe_array_out,
|
||||
Torus *lwe_output_indexes, std::vector<Torus *> lut_vec,
|
||||
std::vector<Torus *> lut_indexes_vec, Torus *lwe_array_in,
|
||||
Torus *lwe_input_indexes, void **bootstrapping_keys,
|
||||
std::vector<int8_t *> pbs_buffer, uint32_t glwe_dimension,
|
||||
uint32_t lwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t base_log, uint32_t level_count,
|
||||
uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
|
||||
uint32_t num_luts, uint32_t lwe_idx,
|
||||
uint32_t max_shared_memory, PBS_TYPE pbs_type,
|
||||
bool sync_streams = true) {
|
||||
auto active_gpu_count =
|
||||
get_active_gpu_count(input_lwe_ciphertext_count, gpu_count);
|
||||
if (sync_streams)
|
||||
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
|
||||
void execute_pbs_async(
|
||||
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
|
||||
const LweArrayVariant<Torus> &lwe_array_out,
|
||||
const LweArrayVariant<Torus> &lwe_output_indexes,
|
||||
std::vector<Torus *> lut_vec, std::vector<Torus *> lut_indexes_vec,
|
||||
const LweArrayVariant<Torus> &lwe_array_in,
|
||||
const LweArrayVariant<Torus> &lwe_input_indexes, void **bootstrapping_keys,
|
||||
std::vector<int8_t *> pbs_buffer, uint32_t glwe_dimension,
|
||||
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t grouping_factor,
|
||||
uint32_t input_lwe_ciphertext_count, PBS_TYPE pbs_type) {
|
||||
switch (sizeof(Torus)) {
|
||||
case sizeof(uint32_t):
|
||||
// 32 bits
|
||||
@@ -141,20 +135,31 @@ void execute_pbs(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
case MULTI_BIT:
|
||||
PANIC("Error: 32-bit multibit PBS is not supported.\n")
|
||||
case CLASSICAL:
|
||||
#pragma omp parallel for num_threads(active_gpu_count)
|
||||
for (uint i = 0; i < active_gpu_count; i++) {
|
||||
for (uint i = 0; i < gpu_count; i++) {
|
||||
int num_inputs_on_gpu =
|
||||
get_num_inputs_on_gpu(input_lwe_ciphertext_count, i, gpu_count);
|
||||
|
||||
int gpu_offset =
|
||||
get_gpu_offset(input_lwe_ciphertext_count, i, gpu_count);
|
||||
auto d_lut_vector_indexes =
|
||||
lut_indexes_vec[i] + (ptrdiff_t)(gpu_offset);
|
||||
|
||||
// Use the macro to get the correct elements for the current iteration
|
||||
// Handles the case when the input/output are scattered through
|
||||
// different gpus and when it is not
|
||||
Torus *current_lwe_array_out = GET_VARIANT_ELEMENT(lwe_array_out, i);
|
||||
Torus *current_lwe_output_indexes =
|
||||
GET_VARIANT_ELEMENT(lwe_output_indexes, i);
|
||||
Torus *current_lwe_array_in = GET_VARIANT_ELEMENT(lwe_array_in, i);
|
||||
Torus *current_lwe_input_indexes =
|
||||
GET_VARIANT_ELEMENT(lwe_input_indexes, i);
|
||||
|
||||
cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
|
||||
streams[i], gpu_indexes[i], lwe_array_out, lwe_output_indexes,
|
||||
lut_vec[i], d_lut_vector_indexes, lwe_array_in, lwe_input_indexes,
|
||||
streams[i], gpu_indexes[i], current_lwe_array_out,
|
||||
current_lwe_output_indexes, lut_vec[i], d_lut_vector_indexes,
|
||||
current_lwe_array_in, current_lwe_input_indexes,
|
||||
bootstrapping_keys[i], pbs_buffer[i], lwe_dimension, glwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_inputs_on_gpu, num_luts,
|
||||
lwe_idx, max_shared_memory, gpu_offset);
|
||||
polynomial_size, base_log, level_count, num_inputs_on_gpu);
|
||||
}
|
||||
break;
|
||||
default:
|
||||
@@ -168,38 +173,60 @@ void execute_pbs(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
case MULTI_BIT:
|
||||
if (grouping_factor == 0)
|
||||
PANIC("Multi-bit PBS error: grouping factor should be > 0.")
|
||||
#pragma omp parallel for num_threads(active_gpu_count)
|
||||
for (uint i = 0; i < active_gpu_count; i++) {
|
||||
for (uint i = 0; i < gpu_count; i++) {
|
||||
int num_inputs_on_gpu =
|
||||
get_num_inputs_on_gpu(input_lwe_ciphertext_count, i, gpu_count);
|
||||
|
||||
// Use the macro to get the correct elements for the current iteration
|
||||
// Handles the case when the input/output are scattered through
|
||||
// different gpus and when it is not
|
||||
Torus *current_lwe_array_out = GET_VARIANT_ELEMENT(lwe_array_out, i);
|
||||
Torus *current_lwe_output_indexes =
|
||||
GET_VARIANT_ELEMENT(lwe_output_indexes, i);
|
||||
Torus *current_lwe_array_in = GET_VARIANT_ELEMENT(lwe_array_in, i);
|
||||
Torus *current_lwe_input_indexes =
|
||||
GET_VARIANT_ELEMENT(lwe_input_indexes, i);
|
||||
|
||||
int gpu_offset =
|
||||
get_gpu_offset(input_lwe_ciphertext_count, i, gpu_count);
|
||||
auto d_lut_vector_indexes =
|
||||
lut_indexes_vec[i] + (ptrdiff_t)(gpu_offset);
|
||||
|
||||
cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
|
||||
streams[i], gpu_indexes[i], lwe_array_out, lwe_output_indexes,
|
||||
lut_vec[i], d_lut_vector_indexes, lwe_array_in, lwe_input_indexes,
|
||||
streams[i], gpu_indexes[i], current_lwe_array_out,
|
||||
current_lwe_output_indexes, lut_vec[i], d_lut_vector_indexes,
|
||||
current_lwe_array_in, current_lwe_input_indexes,
|
||||
bootstrapping_keys[i], pbs_buffer[i], lwe_dimension, glwe_dimension,
|
||||
polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_inputs_on_gpu, num_luts, lwe_idx, max_shared_memory,
|
||||
gpu_offset);
|
||||
num_inputs_on_gpu);
|
||||
}
|
||||
break;
|
||||
case CLASSICAL:
|
||||
#pragma omp parallel for num_threads(active_gpu_count)
|
||||
for (uint i = 0; i < active_gpu_count; i++) {
|
||||
for (uint i = 0; i < gpu_count; i++) {
|
||||
int num_inputs_on_gpu =
|
||||
get_num_inputs_on_gpu(input_lwe_ciphertext_count, i, gpu_count);
|
||||
|
||||
// Use the macro to get the correct elements for the current iteration
|
||||
// Handles the case when the input/output are scattered through
|
||||
// different gpus and when it is not
|
||||
Torus *current_lwe_array_out = GET_VARIANT_ELEMENT(lwe_array_out, i);
|
||||
Torus *current_lwe_output_indexes =
|
||||
GET_VARIANT_ELEMENT(lwe_output_indexes, i);
|
||||
Torus *current_lwe_array_in = GET_VARIANT_ELEMENT(lwe_array_in, i);
|
||||
Torus *current_lwe_input_indexes =
|
||||
GET_VARIANT_ELEMENT(lwe_input_indexes, i);
|
||||
|
||||
int gpu_offset =
|
||||
get_gpu_offset(input_lwe_ciphertext_count, i, gpu_count);
|
||||
auto d_lut_vector_indexes =
|
||||
lut_indexes_vec[i] + (ptrdiff_t)(gpu_offset);
|
||||
|
||||
cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
|
||||
streams[i], gpu_indexes[i], lwe_array_out, lwe_output_indexes,
|
||||
lut_vec[i], d_lut_vector_indexes, lwe_array_in, lwe_input_indexes,
|
||||
streams[i], gpu_indexes[i], current_lwe_array_out,
|
||||
current_lwe_output_indexes, lut_vec[i], d_lut_vector_indexes,
|
||||
current_lwe_array_in, current_lwe_input_indexes,
|
||||
bootstrapping_keys[i], pbs_buffer[i], lwe_dimension, glwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_inputs_on_gpu, num_luts,
|
||||
lwe_idx, max_shared_memory, gpu_offset);
|
||||
polynomial_size, base_log, level_count, num_inputs_on_gpu);
|
||||
}
|
||||
break;
|
||||
default:
|
||||
@@ -210,11 +237,6 @@ void execute_pbs(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
PANIC("Cuda error: unsupported modulus size: only 32 and 64 bit integer "
|
||||
"moduli are supported.")
|
||||
}
|
||||
|
||||
if (sync_streams)
|
||||
for (uint i = 0; i < active_gpu_count; i++) {
|
||||
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
@@ -222,8 +244,7 @@ void execute_scratch_pbs(cudaStream_t stream, uint32_t gpu_index,
|
||||
int8_t **pbs_buffer, uint32_t glwe_dimension,
|
||||
uint32_t lwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t grouping_factor,
|
||||
uint32_t input_lwe_ciphertext_count,
|
||||
uint32_t max_shared_memory, PBS_TYPE pbs_type,
|
||||
uint32_t input_lwe_ciphertext_count, PBS_TYPE pbs_type,
|
||||
bool allocate_gpu_memory) {
|
||||
switch (sizeof(Torus)) {
|
||||
case sizeof(uint32_t):
|
||||
@@ -234,8 +255,7 @@ void execute_scratch_pbs(cudaStream_t stream, uint32_t gpu_index,
|
||||
case CLASSICAL:
|
||||
scratch_cuda_programmable_bootstrap_32(
|
||||
stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, max_shared_memory,
|
||||
allocate_gpu_memory);
|
||||
level_count, input_lwe_ciphertext_count, allocate_gpu_memory);
|
||||
break;
|
||||
default:
|
||||
PANIC("Error: unsupported cuda PBS type.")
|
||||
@@ -250,13 +270,12 @@ void execute_scratch_pbs(cudaStream_t stream, uint32_t gpu_index,
|
||||
scratch_cuda_multi_bit_programmable_bootstrap_64(
|
||||
stream, gpu_index, pbs_buffer, lwe_dimension, glwe_dimension,
|
||||
polynomial_size, level_count, grouping_factor,
|
||||
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory);
|
||||
break;
|
||||
case CLASSICAL:
|
||||
scratch_cuda_programmable_bootstrap_64(
|
||||
stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, max_shared_memory,
|
||||
allocate_gpu_memory);
|
||||
level_count, input_lwe_ciphertext_count, allocate_gpu_memory);
|
||||
break;
|
||||
default:
|
||||
PANIC("Error: unsupported cuda PBS type.")
|
||||
|
||||
@@ -5,10 +5,9 @@
|
||||
*/
|
||||
uint64_t get_buffer_size_programmable_bootstrap_amortized_64(
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {
|
||||
uint32_t input_lwe_ciphertext_count) {
|
||||
return get_buffer_size_programmable_bootstrap_amortized<uint64_t>(
|
||||
glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
|
||||
max_shared_memory);
|
||||
glwe_dimension, polynomial_size, input_lwe_ciphertext_count);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -20,58 +19,50 @@ uint64_t get_buffer_size_programmable_bootstrap_amortized_64(
|
||||
void scratch_cuda_programmable_bootstrap_amortized_32(
|
||||
void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory) {
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
scratch_programmable_bootstrap_amortized<uint32_t, int32_t,
|
||||
AmortizedDegree<256>>(
|
||||
scratch_programmable_bootstrap_amortized<uint32_t, AmortizedDegree<256>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
|
||||
glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
|
||||
max_shared_memory, allocate_gpu_memory);
|
||||
allocate_gpu_memory);
|
||||
break;
|
||||
case 512:
|
||||
scratch_programmable_bootstrap_amortized<uint32_t, int32_t,
|
||||
AmortizedDegree<512>>(
|
||||
scratch_programmable_bootstrap_amortized<uint32_t, AmortizedDegree<512>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
|
||||
glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
|
||||
max_shared_memory, allocate_gpu_memory);
|
||||
allocate_gpu_memory);
|
||||
break;
|
||||
case 1024:
|
||||
scratch_programmable_bootstrap_amortized<uint32_t, int32_t,
|
||||
AmortizedDegree<1024>>(
|
||||
scratch_programmable_bootstrap_amortized<uint32_t, AmortizedDegree<1024>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
|
||||
glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
|
||||
max_shared_memory, allocate_gpu_memory);
|
||||
allocate_gpu_memory);
|
||||
break;
|
||||
case 2048:
|
||||
scratch_programmable_bootstrap_amortized<uint32_t, int32_t,
|
||||
AmortizedDegree<2048>>(
|
||||
scratch_programmable_bootstrap_amortized<uint32_t, AmortizedDegree<2048>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
|
||||
glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
|
||||
max_shared_memory, allocate_gpu_memory);
|
||||
allocate_gpu_memory);
|
||||
break;
|
||||
case 4096:
|
||||
scratch_programmable_bootstrap_amortized<uint32_t, int32_t,
|
||||
AmortizedDegree<4096>>(
|
||||
scratch_programmable_bootstrap_amortized<uint32_t, AmortizedDegree<4096>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
|
||||
glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
|
||||
max_shared_memory, allocate_gpu_memory);
|
||||
allocate_gpu_memory);
|
||||
break;
|
||||
case 8192:
|
||||
scratch_programmable_bootstrap_amortized<uint32_t, int32_t,
|
||||
AmortizedDegree<8192>>(
|
||||
scratch_programmable_bootstrap_amortized<uint32_t, AmortizedDegree<8192>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
|
||||
glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
|
||||
max_shared_memory, allocate_gpu_memory);
|
||||
allocate_gpu_memory);
|
||||
break;
|
||||
case 16384:
|
||||
scratch_programmable_bootstrap_amortized<uint32_t, int32_t,
|
||||
AmortizedDegree<16384>>(
|
||||
scratch_programmable_bootstrap_amortized<uint32_t, AmortizedDegree<16384>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
|
||||
glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
|
||||
max_shared_memory, allocate_gpu_memory);
|
||||
allocate_gpu_memory);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (amortized PBS): unsupported polynomial size. Supported "
|
||||
@@ -88,58 +79,50 @@ void scratch_cuda_programmable_bootstrap_amortized_32(
|
||||
void scratch_cuda_programmable_bootstrap_amortized_64(
|
||||
void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory) {
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
scratch_programmable_bootstrap_amortized<uint64_t, int64_t,
|
||||
AmortizedDegree<256>>(
|
||||
scratch_programmable_bootstrap_amortized<uint64_t, AmortizedDegree<256>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
|
||||
glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
|
||||
max_shared_memory, allocate_gpu_memory);
|
||||
allocate_gpu_memory);
|
||||
break;
|
||||
case 512:
|
||||
scratch_programmable_bootstrap_amortized<uint64_t, int64_t,
|
||||
AmortizedDegree<512>>(
|
||||
scratch_programmable_bootstrap_amortized<uint64_t, AmortizedDegree<512>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
|
||||
glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
|
||||
max_shared_memory, allocate_gpu_memory);
|
||||
allocate_gpu_memory);
|
||||
break;
|
||||
case 1024:
|
||||
scratch_programmable_bootstrap_amortized<uint64_t, int64_t,
|
||||
AmortizedDegree<1024>>(
|
||||
scratch_programmable_bootstrap_amortized<uint64_t, AmortizedDegree<1024>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
|
||||
glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
|
||||
max_shared_memory, allocate_gpu_memory);
|
||||
allocate_gpu_memory);
|
||||
break;
|
||||
case 2048:
|
||||
scratch_programmable_bootstrap_amortized<uint64_t, int64_t,
|
||||
AmortizedDegree<2048>>(
|
||||
scratch_programmable_bootstrap_amortized<uint64_t, AmortizedDegree<2048>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
|
||||
glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
|
||||
max_shared_memory, allocate_gpu_memory);
|
||||
allocate_gpu_memory);
|
||||
break;
|
||||
case 4096:
|
||||
scratch_programmable_bootstrap_amortized<uint64_t, int64_t,
|
||||
AmortizedDegree<4096>>(
|
||||
scratch_programmable_bootstrap_amortized<uint64_t, AmortizedDegree<4096>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
|
||||
glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
|
||||
max_shared_memory, allocate_gpu_memory);
|
||||
allocate_gpu_memory);
|
||||
break;
|
||||
case 8192:
|
||||
scratch_programmable_bootstrap_amortized<uint64_t, int64_t,
|
||||
AmortizedDegree<8192>>(
|
||||
scratch_programmable_bootstrap_amortized<uint64_t, AmortizedDegree<8192>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
|
||||
glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
|
||||
max_shared_memory, allocate_gpu_memory);
|
||||
allocate_gpu_memory);
|
||||
break;
|
||||
case 16384:
|
||||
scratch_programmable_bootstrap_amortized<uint64_t, int64_t,
|
||||
AmortizedDegree<16384>>(
|
||||
scratch_programmable_bootstrap_amortized<uint64_t, AmortizedDegree<16384>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
|
||||
glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
|
||||
max_shared_memory, allocate_gpu_memory);
|
||||
allocate_gpu_memory);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (amortized PBS): unsupported polynomial size. Supported "
|
||||
@@ -157,8 +140,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32(
|
||||
void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
|
||||
int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
|
||||
uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
|
||||
uint32_t max_shared_memory, uint32_t gpu_offset) {
|
||||
uint32_t num_samples) {
|
||||
|
||||
if (base_log > 32)
|
||||
PANIC("Cuda error (amortized PBS): base log should be > number of bits in "
|
||||
@@ -172,7 +154,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32(
|
||||
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
|
||||
(uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
|
||||
glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
|
||||
num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset);
|
||||
num_samples);
|
||||
break;
|
||||
case 512:
|
||||
host_programmable_bootstrap_amortized<uint32_t, AmortizedDegree<512>>(
|
||||
@@ -181,7 +163,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32(
|
||||
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
|
||||
(uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
|
||||
glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
|
||||
num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset);
|
||||
num_samples);
|
||||
break;
|
||||
case 1024:
|
||||
host_programmable_bootstrap_amortized<uint32_t, AmortizedDegree<1024>>(
|
||||
@@ -190,7 +172,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32(
|
||||
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
|
||||
(uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
|
||||
glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
|
||||
num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset);
|
||||
num_samples);
|
||||
break;
|
||||
case 2048:
|
||||
host_programmable_bootstrap_amortized<uint32_t, AmortizedDegree<2048>>(
|
||||
@@ -199,7 +181,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32(
|
||||
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
|
||||
(uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
|
||||
glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
|
||||
num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset);
|
||||
num_samples);
|
||||
break;
|
||||
case 4096:
|
||||
host_programmable_bootstrap_amortized<uint32_t, AmortizedDegree<4096>>(
|
||||
@@ -208,7 +190,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32(
|
||||
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
|
||||
(uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
|
||||
glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
|
||||
num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset);
|
||||
num_samples);
|
||||
break;
|
||||
case 8192:
|
||||
host_programmable_bootstrap_amortized<uint32_t, AmortizedDegree<8192>>(
|
||||
@@ -217,7 +199,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32(
|
||||
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
|
||||
(uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
|
||||
glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
|
||||
num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset);
|
||||
num_samples);
|
||||
break;
|
||||
case 16384:
|
||||
host_programmable_bootstrap_amortized<uint32_t, AmortizedDegree<16384>>(
|
||||
@@ -226,7 +208,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32(
|
||||
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
|
||||
(uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
|
||||
glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
|
||||
num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset);
|
||||
num_samples);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (amortized PBS): unsupported polynomial size. Supported "
|
||||
@@ -268,17 +250,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32(
|
||||
* - base_log: log of the base used for the gadget matrix - B = 2^base_log (~8)
|
||||
* - level_count: number of decomposition levels in the gadget matrix (~4)
|
||||
* - num_samples: number of encrypted input messages
|
||||
* - num_luts: parameter to set the actual number of luts to be
|
||||
* used
|
||||
* - lwe_idx: the index of the LWE input to consider for the GPU of index
|
||||
* gpu_index. In case of multi-GPU computing, it is assumed that only a part of
|
||||
* the input LWE array is copied to each GPU, but the whole LUT array is copied
|
||||
* (because the case when the number of LUTs is smaller than the number of input
|
||||
* LWEs is not trivial to take into account in the data repartition on the
|
||||
* GPUs). `lwe_idx` is used to determine which LUT to consider for a given LWE
|
||||
* input in the LUT array `lut_vector`.
|
||||
* - 'max_shared_memory' maximum amount of shared memory to be used inside
|
||||
* device functions
|
||||
*
|
||||
* This function calls a wrapper to a device kernel that performs the
|
||||
* bootstrapping:
|
||||
@@ -306,8 +278,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
|
||||
void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
|
||||
int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
|
||||
uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
|
||||
uint32_t max_shared_memory, uint32_t gpu_offset) {
|
||||
uint32_t num_samples) {
|
||||
|
||||
if (base_log > 64)
|
||||
PANIC("Cuda error (amortized PBS): base log should be > number of bits in "
|
||||
@@ -321,7 +292,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
|
||||
(uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
|
||||
(uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
|
||||
glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
|
||||
num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset);
|
||||
num_samples);
|
||||
break;
|
||||
case 512:
|
||||
host_programmable_bootstrap_amortized<uint64_t, AmortizedDegree<512>>(
|
||||
@@ -330,7 +301,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
|
||||
(uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
|
||||
(uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
|
||||
glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
|
||||
num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset);
|
||||
num_samples);
|
||||
break;
|
||||
case 1024:
|
||||
host_programmable_bootstrap_amortized<uint64_t, AmortizedDegree<1024>>(
|
||||
@@ -339,7 +310,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
|
||||
(uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
|
||||
(uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
|
||||
glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
|
||||
num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset);
|
||||
num_samples);
|
||||
break;
|
||||
case 2048:
|
||||
host_programmable_bootstrap_amortized<uint64_t, AmortizedDegree<2048>>(
|
||||
@@ -348,7 +319,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
|
||||
(uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
|
||||
(uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
|
||||
glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
|
||||
num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset);
|
||||
num_samples);
|
||||
break;
|
||||
case 4096:
|
||||
host_programmable_bootstrap_amortized<uint64_t, AmortizedDegree<4096>>(
|
||||
@@ -357,7 +328,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
|
||||
(uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
|
||||
(uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
|
||||
glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
|
||||
num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset);
|
||||
num_samples);
|
||||
break;
|
||||
case 8192:
|
||||
host_programmable_bootstrap_amortized<uint64_t, AmortizedDegree<8192>>(
|
||||
@@ -366,7 +337,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
|
||||
(uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
|
||||
(uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
|
||||
glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
|
||||
num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset);
|
||||
num_samples);
|
||||
break;
|
||||
case 16384:
|
||||
host_programmable_bootstrap_amortized<uint64_t, AmortizedDegree<16384>>(
|
||||
@@ -375,7 +346,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
|
||||
(uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
|
||||
(uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
|
||||
glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
|
||||
num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset);
|
||||
num_samples);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (amortized PBS): unsupported polynomial size. Supported "
|
||||
@@ -392,7 +363,6 @@ void cleanup_cuda_programmable_bootstrap_amortized(void *stream,
|
||||
uint32_t gpu_index,
|
||||
int8_t **pbs_buffer) {
|
||||
|
||||
check_cuda_error(cudaSetDevice(gpu_index));
|
||||
// Free memory
|
||||
cuda_drop_async(*pbs_buffer, static_cast<cudaStream_t>(stream), gpu_index);
|
||||
}
|
||||
|
||||
@@ -42,17 +42,19 @@ template <typename Torus, class params, sharedMemDegree SMD>
|
||||
* - base_log: log base used for the gadget matrix - B = 2^base_log (~8)
|
||||
* - level_count: number of decomposition levels in the gadget matrix (~4)
|
||||
* - gpu_num: index of the current GPU (useful for multi-GPU computations)
|
||||
* - lwe_idx: equal to the number of samples per gpu x gpu_num
|
||||
* - device_memory_size_per_sample: amount of global memory to allocate if SMD
|
||||
* is not FULLSM
|
||||
*/
|
||||
__global__ void device_programmable_bootstrap_amortized(
|
||||
Torus *lwe_array_out, Torus *lwe_output_indexes, Torus *lut_vector,
|
||||
Torus *lut_vector_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes,
|
||||
double2 *bootstrapping_key, int8_t *device_mem, uint32_t glwe_dimension,
|
||||
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t lwe_idx,
|
||||
size_t device_memory_size_per_sample, uint32_t gpu_offset) {
|
||||
Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
|
||||
const Torus *__restrict__ lut_vector,
|
||||
const Torus *__restrict__ lut_vector_indexes,
|
||||
const Torus *__restrict__ lwe_array_in,
|
||||
const Torus *__restrict__ lwe_input_indexes,
|
||||
const double2 *__restrict__ bootstrapping_key, int8_t *device_mem,
|
||||
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t base_log, uint32_t level_count,
|
||||
size_t device_memory_size_per_sample) {
|
||||
// We use shared memory for the polynomials that are used often during the
|
||||
// bootstrap, since shared memory is kept in L1 cache and accessing it is
|
||||
// much faster than global memory
|
||||
@@ -79,16 +81,15 @@ __global__ void device_programmable_bootstrap_amortized(
|
||||
(ptrdiff_t)((glwe_dimension + 1) * polynomial_size / 2);
|
||||
|
||||
auto block_lwe_array_in =
|
||||
&lwe_array_in[lwe_input_indexes[blockIdx.x + gpu_offset] *
|
||||
(lwe_dimension + 1)];
|
||||
Torus *block_lut_vector =
|
||||
&lut_vector[lut_vector_indexes[lwe_idx + blockIdx.x] * params::degree *
|
||||
&lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)];
|
||||
const Torus *block_lut_vector =
|
||||
&lut_vector[lut_vector_indexes[blockIdx.x] * params::degree *
|
||||
(glwe_dimension + 1)];
|
||||
|
||||
// Put "b", the body, in [0, 2N[
|
||||
Torus b_hat = 0;
|
||||
rescale_torus_element(block_lwe_array_in[lwe_dimension], b_hat,
|
||||
2 * params::degree); // 2 * params::log2_degree + 1);
|
||||
modulus_switch(block_lwe_array_in[lwe_dimension], b_hat,
|
||||
params::log2_degree + 1);
|
||||
|
||||
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
|
||||
params::degree / params::opt>(
|
||||
@@ -103,8 +104,8 @@ __global__ void device_programmable_bootstrap_amortized(
|
||||
|
||||
// Put "a" in [0, 2N[ instead of Zq
|
||||
Torus a_hat = 0;
|
||||
rescale_torus_element(block_lwe_array_in[iteration], a_hat,
|
||||
2 * params::degree); // 2 * params::log2_degree + 1);
|
||||
modulus_switch(block_lwe_array_in[iteration], a_hat,
|
||||
params::log2_degree + 1);
|
||||
|
||||
// Perform ACC * (X^ä - 1)
|
||||
multiply_by_monomial_negacyclic_and_sub_polynomial<
|
||||
@@ -198,7 +199,7 @@ __global__ void device_programmable_bootstrap_amortized(
|
||||
}
|
||||
|
||||
auto block_lwe_array_out =
|
||||
&lwe_array_out[lwe_output_indexes[blockIdx.x + gpu_offset] *
|
||||
&lwe_array_out[lwe_output_indexes[blockIdx.x] *
|
||||
(glwe_dimension * polynomial_size + 1)];
|
||||
|
||||
// The blind rotation for this block is over
|
||||
@@ -212,8 +213,7 @@ __global__ void device_programmable_bootstrap_amortized(
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ __device__ uint64_t
|
||||
get_buffer_size_full_sm_programmable_bootstrap_amortized(
|
||||
uint64_t get_buffer_size_full_sm_programmable_bootstrap_amortized(
|
||||
uint32_t polynomial_size, uint32_t glwe_dimension) {
|
||||
return sizeof(Torus) * polynomial_size * (glwe_dimension + 1) + // accumulator
|
||||
sizeof(Torus) * polynomial_size *
|
||||
@@ -224,17 +224,17 @@ get_buffer_size_full_sm_programmable_bootstrap_amortized(
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ __device__ uint64_t
|
||||
get_buffer_size_partial_sm_programmable_bootstrap_amortized(
|
||||
uint64_t get_buffer_size_partial_sm_programmable_bootstrap_amortized(
|
||||
uint32_t polynomial_size) {
|
||||
return sizeof(double2) * polynomial_size / 2; // accumulator fft
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ __device__ uint64_t get_buffer_size_programmable_bootstrap_amortized(
|
||||
uint64_t get_buffer_size_programmable_bootstrap_amortized(
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {
|
||||
uint32_t input_lwe_ciphertext_count) {
|
||||
|
||||
int max_shared_memory = cuda_get_max_shared_memory(0);
|
||||
uint64_t full_sm =
|
||||
get_buffer_size_full_sm_programmable_bootstrap_amortized<Torus>(
|
||||
polynomial_size, glwe_dimension);
|
||||
@@ -252,20 +252,19 @@ __host__ __device__ uint64_t get_buffer_size_programmable_bootstrap_amortized(
|
||||
return device_mem + device_mem % sizeof(double2);
|
||||
}
|
||||
|
||||
template <typename Torus, typename STorus, typename params>
|
||||
template <typename Torus, typename params>
|
||||
__host__ void scratch_programmable_bootstrap_amortized(
|
||||
cudaStream_t stream, uint32_t gpu_index, int8_t **pbs_buffer,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory) {
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
|
||||
|
||||
cudaSetDevice(gpu_index);
|
||||
uint64_t full_sm =
|
||||
get_buffer_size_full_sm_programmable_bootstrap_amortized<Torus>(
|
||||
polynomial_size, glwe_dimension);
|
||||
uint64_t partial_sm =
|
||||
get_buffer_size_partial_sm_programmable_bootstrap_amortized<Torus>(
|
||||
polynomial_size);
|
||||
int max_shared_memory = cuda_get_max_shared_memory(0);
|
||||
if (max_shared_memory >= partial_sm && max_shared_memory < full_sm) {
|
||||
cudaFuncSetAttribute(
|
||||
device_programmable_bootstrap_amortized<Torus, params, PARTIALSM>,
|
||||
@@ -284,8 +283,7 @@ __host__ void scratch_programmable_bootstrap_amortized(
|
||||
if (allocate_gpu_memory) {
|
||||
uint64_t buffer_size =
|
||||
get_buffer_size_programmable_bootstrap_amortized<Torus>(
|
||||
glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
|
||||
max_shared_memory);
|
||||
glwe_dimension, polynomial_size, input_lwe_ciphertext_count);
|
||||
*pbs_buffer = (int8_t *)cuda_malloc_async(buffer_size, stream, gpu_index);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
@@ -298,10 +296,8 @@ __host__ void host_programmable_bootstrap_amortized(
|
||||
Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
|
||||
int8_t *pbs_buffer, uint32_t glwe_dimension, uint32_t lwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t num_luts, uint32_t lwe_idx,
|
||||
uint32_t max_shared_memory, uint32_t gpu_offset) {
|
||||
uint32_t input_lwe_ciphertext_count) {
|
||||
|
||||
cudaSetDevice(gpu_index);
|
||||
uint64_t SM_FULL =
|
||||
get_buffer_size_full_sm_programmable_bootstrap_amortized<Torus>(
|
||||
polynomial_size, glwe_dimension);
|
||||
@@ -314,6 +310,9 @@ __host__ void host_programmable_bootstrap_amortized(
|
||||
|
||||
uint64_t DM_FULL = SM_FULL;
|
||||
|
||||
int max_shared_memory = cuda_get_max_shared_memory(0);
|
||||
cudaSetDevice(gpu_index);
|
||||
|
||||
// Create a 1-dimensional grid of threads
|
||||
// where each block handles 1 sample and each thread
|
||||
// handles opt polynomial coefficients
|
||||
@@ -333,14 +332,14 @@ __host__ void host_programmable_bootstrap_amortized(
|
||||
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
|
||||
lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer,
|
||||
glwe_dimension, lwe_dimension, polynomial_size, base_log,
|
||||
level_count, lwe_idx, DM_FULL, gpu_offset);
|
||||
level_count, DM_FULL);
|
||||
} else if (max_shared_memory < SM_FULL) {
|
||||
device_programmable_bootstrap_amortized<Torus, params, PARTIALSM>
|
||||
<<<grid, thds, SM_PART, stream>>>(
|
||||
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
|
||||
lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer,
|
||||
glwe_dimension, lwe_dimension, polynomial_size, base_log,
|
||||
level_count, lwe_idx, DM_PART, gpu_offset);
|
||||
level_count, DM_PART);
|
||||
} else {
|
||||
// For devices with compute capability 7.x a single thread block can
|
||||
// address the full capacity of shared memory. Shared memory on the
|
||||
@@ -352,7 +351,7 @@ __host__ void host_programmable_bootstrap_amortized(
|
||||
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
|
||||
lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer,
|
||||
glwe_dimension, lwe_dimension, polynomial_size, base_log,
|
||||
level_count, lwe_idx, 0, gpu_offset);
|
||||
level_count, 0);
|
||||
}
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
@@ -36,12 +36,15 @@ namespace cg = cooperative_groups;
|
||||
*/
|
||||
template <typename Torus, class params, sharedMemDegree SMD>
|
||||
__global__ void device_programmable_bootstrap_cg(
|
||||
Torus *lwe_array_out, Torus *lwe_output_indexes, Torus *lut_vector,
|
||||
Torus *lut_vector_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes,
|
||||
double2 *bootstrapping_key, double2 *join_buffer, uint32_t lwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
|
||||
int8_t *device_mem, uint64_t device_memory_size_per_block,
|
||||
uint32_t gpu_offset) {
|
||||
Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
|
||||
const Torus *__restrict__ lut_vector,
|
||||
const Torus *__restrict__ lut_vector_indexes,
|
||||
const Torus *__restrict__ lwe_array_in,
|
||||
const Torus *__restrict__ lwe_input_indexes,
|
||||
const double2 *__restrict__ bootstrapping_key, double2 *join_buffer,
|
||||
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, int8_t *device_mem,
|
||||
uint64_t device_memory_size_per_block) {
|
||||
|
||||
grid_group grid = this_grid();
|
||||
|
||||
@@ -74,12 +77,12 @@ __global__ void device_programmable_bootstrap_cg(
|
||||
|
||||
// The third dimension of the block is used to determine on which ciphertext
|
||||
// this block is operating, in the case of batch bootstraps
|
||||
Torus *block_lwe_array_in =
|
||||
&lwe_array_in[lwe_input_indexes[blockIdx.z + gpu_offset] *
|
||||
(lwe_dimension + 1)];
|
||||
const Torus *block_lwe_array_in =
|
||||
&lwe_array_in[lwe_input_indexes[blockIdx.z] * (lwe_dimension + 1)];
|
||||
|
||||
Torus *block_lut_vector = &lut_vector[lut_vector_indexes[blockIdx.z] *
|
||||
params::degree * (glwe_dimension + 1)];
|
||||
const Torus *block_lut_vector =
|
||||
&lut_vector[lut_vector_indexes[blockIdx.z] * params::degree *
|
||||
(glwe_dimension + 1)];
|
||||
|
||||
double2 *block_join_buffer =
|
||||
&join_buffer[blockIdx.z * level_count * (glwe_dimension + 1) *
|
||||
@@ -90,8 +93,8 @@ __global__ void device_programmable_bootstrap_cg(
|
||||
|
||||
// Put "b" in [0, 2N[
|
||||
Torus b_hat = 0;
|
||||
rescale_torus_element(block_lwe_array_in[lwe_dimension], b_hat,
|
||||
2 * params::degree);
|
||||
modulus_switch(block_lwe_array_in[lwe_dimension], b_hat,
|
||||
params::log2_degree + 1);
|
||||
|
||||
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
|
||||
params::degree / params::opt>(
|
||||
@@ -103,8 +106,7 @@ __global__ void device_programmable_bootstrap_cg(
|
||||
|
||||
// Put "a" in [0, 2N[
|
||||
Torus a_hat = 0;
|
||||
rescale_torus_element(block_lwe_array_in[i], a_hat,
|
||||
2 * params::degree); // 2 * params::log2_degree + 1);
|
||||
modulus_switch(block_lwe_array_in[i], a_hat, params::log2_degree + 1);
|
||||
|
||||
// Perform ACC * (X^ä - 1)
|
||||
multiply_by_monomial_negacyclic_and_sub_polynomial<
|
||||
@@ -140,7 +142,7 @@ __global__ void device_programmable_bootstrap_cg(
|
||||
}
|
||||
|
||||
auto block_lwe_array_out =
|
||||
&lwe_array_out[lwe_output_indexes[blockIdx.z + gpu_offset] *
|
||||
&lwe_array_out[lwe_output_indexes[blockIdx.z] *
|
||||
(glwe_dimension * polynomial_size + 1) +
|
||||
blockIdx.y * polynomial_size];
|
||||
|
||||
@@ -154,20 +156,19 @@ __global__ void device_programmable_bootstrap_cg(
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus, typename STorus, typename params>
|
||||
template <typename Torus, typename params>
|
||||
__host__ void scratch_programmable_bootstrap_cg(
|
||||
cudaStream_t stream, uint32_t gpu_index,
|
||||
pbs_buffer<Torus, CLASSICAL> **buffer, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory) {
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
|
||||
|
||||
cudaSetDevice(gpu_index);
|
||||
uint64_t full_sm =
|
||||
get_buffer_size_full_sm_programmable_bootstrap_cg<Torus>(polynomial_size);
|
||||
uint64_t partial_sm =
|
||||
get_buffer_size_partial_sm_programmable_bootstrap_cg<Torus>(
|
||||
polynomial_size);
|
||||
int max_shared_memory = cuda_get_max_shared_memory(0);
|
||||
if (max_shared_memory >= partial_sm && max_shared_memory < full_sm) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_programmable_bootstrap_cg<Torus, params, PARTIALSM>,
|
||||
@@ -201,9 +202,7 @@ __host__ void host_programmable_bootstrap_cg(
|
||||
Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
|
||||
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t glwe_dimension,
|
||||
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
|
||||
uint32_t num_luts, uint32_t max_shared_memory, uint32_t gpu_offset) {
|
||||
cudaSetDevice(gpu_index);
|
||||
uint32_t level_count, uint32_t input_lwe_ciphertext_count) {
|
||||
|
||||
// With SM each block corresponds to either the mask or body, no need to
|
||||
// duplicate data for each
|
||||
@@ -214,6 +213,9 @@ __host__ void host_programmable_bootstrap_cg(
|
||||
get_buffer_size_partial_sm_programmable_bootstrap_cg<Torus>(
|
||||
polynomial_size);
|
||||
|
||||
int max_shared_memory = cuda_get_max_shared_memory(0);
|
||||
cudaSetDevice(gpu_index);
|
||||
|
||||
uint64_t full_dm = full_sm;
|
||||
|
||||
uint64_t partial_dm = full_dm - partial_sm;
|
||||
@@ -224,7 +226,7 @@ __host__ void host_programmable_bootstrap_cg(
|
||||
int thds = polynomial_size / params::opt;
|
||||
dim3 grid(level_count, glwe_dimension + 1, input_lwe_ciphertext_count);
|
||||
|
||||
void *kernel_args[15];
|
||||
void *kernel_args[14];
|
||||
kernel_args[0] = &lwe_array_out;
|
||||
kernel_args[1] = &lwe_output_indexes;
|
||||
kernel_args[2] = &lut_vector;
|
||||
@@ -238,7 +240,6 @@ __host__ void host_programmable_bootstrap_cg(
|
||||
kernel_args[10] = &base_log;
|
||||
kernel_args[11] = &level_count;
|
||||
kernel_args[12] = &d_mem;
|
||||
kernel_args[14] = &gpu_offset;
|
||||
|
||||
if (max_shared_memory < partial_sm) {
|
||||
kernel_args[13] = &full_dm;
|
||||
@@ -264,8 +265,7 @@ __host__ void host_programmable_bootstrap_cg(
|
||||
// Verify if the grid size satisfies the cooperative group constraints
|
||||
template <typename Torus, class params>
|
||||
__host__ bool verify_cuda_programmable_bootstrap_cg_grid_size(
|
||||
int glwe_dimension, int level_count, int num_samples,
|
||||
uint32_t max_shared_memory) {
|
||||
int glwe_dimension, int level_count, int num_samples) {
|
||||
|
||||
// If Cooperative Groups is not supported, no need to check anything else
|
||||
if (!cuda_check_support_cooperative_groups())
|
||||
@@ -285,6 +285,7 @@ __host__ bool verify_cuda_programmable_bootstrap_cg_grid_size(
|
||||
int number_of_blocks = level_count * (glwe_dimension + 1) * num_samples;
|
||||
int max_active_blocks_per_sm;
|
||||
|
||||
int max_shared_memory = cuda_get_max_shared_memory(0);
|
||||
if (max_shared_memory < partial_sm) {
|
||||
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&max_active_blocks_per_sm,
|
||||
@@ -310,37 +311,30 @@ __host__ bool verify_cuda_programmable_bootstrap_cg_grid_size(
|
||||
// Verify if the grid size satisfies the cooperative group constraints
|
||||
template <typename Torus>
|
||||
__host__ bool supports_cooperative_groups_on_programmable_bootstrap(
|
||||
int glwe_dimension, int polynomial_size, int level_count, int num_samples,
|
||||
uint32_t max_shared_memory) {
|
||||
int glwe_dimension, int polynomial_size, int level_count, int num_samples) {
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
return verify_cuda_programmable_bootstrap_cg_grid_size<
|
||||
Torus, AmortizedDegree<256>>(glwe_dimension, level_count, num_samples,
|
||||
max_shared_memory);
|
||||
Torus, AmortizedDegree<256>>(glwe_dimension, level_count, num_samples);
|
||||
case 512:
|
||||
return verify_cuda_programmable_bootstrap_cg_grid_size<
|
||||
Torus, AmortizedDegree<512>>(glwe_dimension, level_count, num_samples,
|
||||
max_shared_memory);
|
||||
Torus, AmortizedDegree<512>>(glwe_dimension, level_count, num_samples);
|
||||
case 1024:
|
||||
return verify_cuda_programmable_bootstrap_cg_grid_size<
|
||||
Torus, AmortizedDegree<1024>>(glwe_dimension, level_count, num_samples,
|
||||
max_shared_memory);
|
||||
Torus, AmortizedDegree<1024>>(glwe_dimension, level_count, num_samples);
|
||||
case 2048:
|
||||
return verify_cuda_programmable_bootstrap_cg_grid_size<
|
||||
Torus, AmortizedDegree<2048>>(glwe_dimension, level_count, num_samples,
|
||||
max_shared_memory);
|
||||
Torus, AmortizedDegree<2048>>(glwe_dimension, level_count, num_samples);
|
||||
case 4096:
|
||||
return verify_cuda_programmable_bootstrap_cg_grid_size<
|
||||
Torus, AmortizedDegree<4096>>(glwe_dimension, level_count, num_samples,
|
||||
max_shared_memory);
|
||||
Torus, AmortizedDegree<4096>>(glwe_dimension, level_count, num_samples);
|
||||
case 8192:
|
||||
return verify_cuda_programmable_bootstrap_cg_grid_size<
|
||||
Torus, AmortizedDegree<8192>>(glwe_dimension, level_count, num_samples,
|
||||
max_shared_memory);
|
||||
Torus, AmortizedDegree<8192>>(glwe_dimension, level_count, num_samples);
|
||||
case 16384:
|
||||
return verify_cuda_programmable_bootstrap_cg_grid_size<
|
||||
Torus, AmortizedDegree<16384>>(glwe_dimension, level_count, num_samples,
|
||||
max_shared_memory);
|
||||
Torus, AmortizedDegree<16384>>(glwe_dimension, level_count,
|
||||
num_samples);
|
||||
default:
|
||||
PANIC("Cuda error (classical PBS): unsupported polynomial size. "
|
||||
"Supported N's are powers of two"
|
||||
|
||||
@@ -18,15 +18,19 @@
|
||||
#include <vector>
|
||||
|
||||
template <typename Torus, class params, sharedMemDegree SMD>
|
||||
__global__ void device_multi_bit_programmable_bootstrap_cg_accumulate(
|
||||
Torus *lwe_array_out, Torus *lwe_output_indexes, Torus *lut_vector,
|
||||
Torus *lut_vector_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes,
|
||||
double2 *keybundle_array, double2 *join_buffer, Torus *global_accumulator,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t grouping_factor,
|
||||
uint32_t lwe_offset, uint32_t lwe_chunk_size,
|
||||
uint32_t keybundle_size_per_input, int8_t *device_mem,
|
||||
uint64_t device_memory_size_per_block, uint32_t gpu_offset) {
|
||||
__global__ void __launch_bounds__(params::degree / params::opt)
|
||||
device_multi_bit_programmable_bootstrap_cg_accumulate(
|
||||
Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
|
||||
const Torus *__restrict__ lut_vector,
|
||||
const Torus *__restrict__ lut_vector_indexes,
|
||||
const Torus *__restrict__ lwe_array_in,
|
||||
const Torus *__restrict__ lwe_input_indexes,
|
||||
const double2 *__restrict__ keybundle_array, double2 *join_buffer,
|
||||
Torus *global_accumulator, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t grouping_factor, uint32_t lwe_offset,
|
||||
uint32_t lwe_chunk_size, uint32_t keybundle_size_per_input,
|
||||
int8_t *device_mem, uint64_t device_memory_size_per_block) {
|
||||
|
||||
grid_group grid = this_grid();
|
||||
|
||||
@@ -54,12 +58,12 @@ __global__ void device_multi_bit_programmable_bootstrap_cg_accumulate(
|
||||
|
||||
// The third dimension of the block is used to determine on which ciphertext
|
||||
// this block is operating, in the case of batch bootstraps
|
||||
Torus *block_lwe_array_in =
|
||||
&lwe_array_in[lwe_input_indexes[blockIdx.z + gpu_offset] *
|
||||
(lwe_dimension + 1)];
|
||||
const Torus *block_lwe_array_in =
|
||||
&lwe_array_in[lwe_input_indexes[blockIdx.z] * (lwe_dimension + 1)];
|
||||
|
||||
Torus *block_lut_vector = &lut_vector[lut_vector_indexes[blockIdx.z] *
|
||||
params::degree * (glwe_dimension + 1)];
|
||||
const Torus *block_lut_vector =
|
||||
&lut_vector[lut_vector_indexes[blockIdx.z] * params::degree *
|
||||
(glwe_dimension + 1)];
|
||||
|
||||
double2 *block_join_buffer =
|
||||
&join_buffer[blockIdx.z * level_count * (glwe_dimension + 1) *
|
||||
@@ -69,15 +73,15 @@ __global__ void device_multi_bit_programmable_bootstrap_cg_accumulate(
|
||||
global_accumulator +
|
||||
(blockIdx.y + blockIdx.z * (glwe_dimension + 1)) * params::degree;
|
||||
|
||||
double2 *keybundle = keybundle_array +
|
||||
// select the input
|
||||
blockIdx.z * keybundle_size_per_input;
|
||||
const double2 *keybundle = keybundle_array +
|
||||
// select the input
|
||||
blockIdx.z * keybundle_size_per_input;
|
||||
|
||||
if (lwe_offset == 0) {
|
||||
// Put "b" in [0, 2N[
|
||||
Torus b_hat = 0;
|
||||
rescale_torus_element(block_lwe_array_in[lwe_dimension], b_hat,
|
||||
2 * params::degree);
|
||||
modulus_switch(block_lwe_array_in[lwe_dimension], b_hat,
|
||||
params::log2_degree + 1);
|
||||
|
||||
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
|
||||
params::degree / params::opt>(
|
||||
@@ -117,7 +121,7 @@ __global__ void device_multi_bit_programmable_bootstrap_cg_accumulate(
|
||||
|
||||
if (lwe_offset + lwe_chunk_size >= (lwe_dimension / grouping_factor)) {
|
||||
auto block_lwe_array_out =
|
||||
&lwe_array_out[lwe_output_indexes[blockIdx.z + gpu_offset] *
|
||||
&lwe_array_out[lwe_output_indexes[blockIdx.z] *
|
||||
(glwe_dimension * polynomial_size + 1) +
|
||||
blockIdx.y * polynomial_size];
|
||||
|
||||
@@ -137,24 +141,21 @@ __global__ void device_multi_bit_programmable_bootstrap_cg_accumulate(
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ __device__ uint64_t
|
||||
get_buffer_size_partial_sm_cg_multibit_programmable_bootstrap(
|
||||
uint64_t get_buffer_size_partial_sm_cg_multibit_programmable_bootstrap(
|
||||
uint32_t polynomial_size) {
|
||||
return sizeof(Torus) * polynomial_size; // accumulator
|
||||
}
|
||||
template <typename Torus>
|
||||
__host__ __device__ uint64_t
|
||||
get_buffer_size_full_sm_cg_multibit_programmable_bootstrap(
|
||||
uint64_t get_buffer_size_full_sm_cg_multibit_programmable_bootstrap(
|
||||
uint32_t polynomial_size) {
|
||||
return sizeof(Torus) * polynomial_size * 2; // accumulator
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ __device__ uint64_t get_buffer_size_cg_multibit_programmable_bootstrap(
|
||||
uint64_t get_buffer_size_cg_multibit_programmable_bootstrap(
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
|
||||
uint32_t grouping_factor, uint32_t lwe_chunk_size,
|
||||
uint32_t max_shared_memory) {
|
||||
uint32_t grouping_factor, uint32_t lwe_chunk_size) {
|
||||
|
||||
uint64_t buffer_size = 0;
|
||||
buffer_size += input_lwe_ciphertext_count * lwe_chunk_size * level_count *
|
||||
@@ -169,15 +170,13 @@ __host__ __device__ uint64_t get_buffer_size_cg_multibit_programmable_bootstrap(
|
||||
return buffer_size + buffer_size % sizeof(double2);
|
||||
}
|
||||
|
||||
template <typename Torus, typename STorus, typename params>
|
||||
template <typename Torus, typename params>
|
||||
__host__ void scratch_cg_multi_bit_programmable_bootstrap(
|
||||
cudaStream_t stream, uint32_t gpu_index,
|
||||
pbs_buffer<uint64_t, MULTI_BIT> **buffer, uint32_t glwe_dimension,
|
||||
pbs_buffer<Torus, MULTI_BIT> **buffer, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0) {
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
|
||||
|
||||
cudaSetDevice(gpu_index);
|
||||
uint64_t full_sm_keybundle =
|
||||
get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle<Torus>(
|
||||
polynomial_size);
|
||||
@@ -188,6 +187,7 @@ __host__ void scratch_cg_multi_bit_programmable_bootstrap(
|
||||
get_buffer_size_partial_sm_cg_multibit_programmable_bootstrap<Torus>(
|
||||
polynomial_size);
|
||||
|
||||
int max_shared_memory = cuda_get_max_shared_memory(0);
|
||||
if (max_shared_memory < full_sm_keybundle) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_multi_bit_programmable_bootstrap_keybundle<Torus, params, NOSM>,
|
||||
@@ -240,11 +240,9 @@ __host__ void scratch_cg_multi_bit_programmable_bootstrap(
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
if (!lwe_chunk_size)
|
||||
lwe_chunk_size =
|
||||
get_lwe_chunk_size<Torus, params>(gpu_index, input_lwe_ciphertext_count,
|
||||
polynomial_size, max_shared_memory);
|
||||
*buffer = new pbs_buffer<uint64_t, MULTI_BIT>(
|
||||
auto lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
|
||||
gpu_index, input_lwe_ciphertext_count, polynomial_size);
|
||||
*buffer = new pbs_buffer<Torus, MULTI_BIT>(
|
||||
stream, gpu_index, glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, lwe_chunk_size, PBS_VARIANT::CG,
|
||||
allocate_gpu_memory);
|
||||
@@ -258,10 +256,8 @@ __host__ void execute_cg_external_product_loop(
|
||||
pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t num_samples,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
|
||||
uint32_t lwe_chunk_size, uint32_t max_shared_memory, int lwe_offset,
|
||||
uint32_t gpu_offset) {
|
||||
uint32_t lwe_chunk_size, int lwe_offset) {
|
||||
|
||||
cudaSetDevice(gpu_index);
|
||||
uint64_t full_dm =
|
||||
get_buffer_size_full_sm_cg_multibit_programmable_bootstrap<Torus>(
|
||||
polynomial_size);
|
||||
@@ -270,6 +266,9 @@ __host__ void execute_cg_external_product_loop(
|
||||
polynomial_size);
|
||||
uint64_t no_dm = 0;
|
||||
|
||||
int max_shared_memory = cuda_get_max_shared_memory(0);
|
||||
cudaSetDevice(gpu_index);
|
||||
|
||||
uint32_t keybundle_size_per_input =
|
||||
lwe_chunk_size * level_count * (glwe_dimension + 1) *
|
||||
(glwe_dimension + 1) * (polynomial_size / 2);
|
||||
@@ -282,7 +281,7 @@ __host__ void execute_cg_external_product_loop(
|
||||
auto global_accumulator = buffer->global_accumulator;
|
||||
auto buffer_fft = buffer->global_accumulator_fft;
|
||||
|
||||
void *kernel_args[21];
|
||||
void *kernel_args[20];
|
||||
kernel_args[0] = &lwe_array_out;
|
||||
kernel_args[1] = &lwe_output_indexes;
|
||||
kernel_args[2] = &lut_vector;
|
||||
@@ -302,7 +301,6 @@ __host__ void execute_cg_external_product_loop(
|
||||
kernel_args[16] = &chunk_size;
|
||||
kernel_args[17] = &keybundle_size_per_input;
|
||||
kernel_args[18] = &d_mem;
|
||||
kernel_args[20] = &gpu_offset;
|
||||
|
||||
dim3 grid_accumulate(level_count, glwe_dimension + 1, num_samples);
|
||||
dim3 thds(polynomial_size / params::opt, 1, 1);
|
||||
@@ -328,21 +326,17 @@ __host__ void execute_cg_external_product_loop(
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus, typename STorus, class params>
|
||||
template <typename Torus, class params>
|
||||
__host__ void host_cg_multi_bit_programmable_bootstrap(
|
||||
cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
|
||||
Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
|
||||
Torus *lwe_array_in, Torus *lwe_input_indexes, uint64_t *bootstrapping_key,
|
||||
pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
|
||||
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
|
||||
uint32_t gpu_offset, uint32_t lwe_chunk_size = 0) {
|
||||
cudaSetDevice(gpu_index);
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
|
||||
|
||||
if (!lwe_chunk_size)
|
||||
lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
|
||||
gpu_index, num_samples, polynomial_size, max_shared_memory);
|
||||
auto lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
|
||||
gpu_index, num_samples, polynomial_size);
|
||||
|
||||
for (uint32_t lwe_offset = 0; lwe_offset < (lwe_dimension / grouping_factor);
|
||||
lwe_offset += lwe_chunk_size) {
|
||||
@@ -351,24 +345,21 @@ __host__ void host_cg_multi_bit_programmable_bootstrap(
|
||||
execute_compute_keybundle<Torus, params>(
|
||||
stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
|
||||
buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
grouping_factor, base_log, level_count, max_shared_memory,
|
||||
lwe_chunk_size, lwe_offset, gpu_offset);
|
||||
grouping_factor, base_log, level_count, lwe_chunk_size, lwe_offset);
|
||||
|
||||
// Accumulate
|
||||
execute_cg_external_product_loop<Torus, params>(
|
||||
stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, lwe_array_out, lwe_output_indexes, buffer,
|
||||
num_samples, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
grouping_factor, base_log, level_count, lwe_chunk_size,
|
||||
max_shared_memory, lwe_offset, gpu_offset);
|
||||
grouping_factor, base_log, level_count, lwe_chunk_size, lwe_offset);
|
||||
}
|
||||
}
|
||||
|
||||
// Verify if the grid size satisfies the cooperative group constraints
|
||||
template <typename Torus, class params>
|
||||
__host__ bool verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size(
|
||||
int glwe_dimension, int level_count, int num_samples,
|
||||
uint32_t max_shared_memory) {
|
||||
int glwe_dimension, int level_count, int num_samples) {
|
||||
|
||||
// If Cooperative Groups is not supported, no need to check anything else
|
||||
if (!cuda_check_support_cooperative_groups())
|
||||
@@ -388,6 +379,7 @@ __host__ bool verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size(
|
||||
int number_of_blocks = level_count * (glwe_dimension + 1) * num_samples;
|
||||
int max_active_blocks_per_sm;
|
||||
|
||||
int max_shared_memory = cuda_get_max_shared_memory(0);
|
||||
if (max_shared_memory < partial_sm_cg_accumulate) {
|
||||
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&max_active_blocks_per_sm,
|
||||
@@ -418,37 +410,30 @@ __host__ bool verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size(
|
||||
// group constraints
|
||||
template <typename Torus>
|
||||
__host__ bool supports_cooperative_groups_on_multibit_programmable_bootstrap(
|
||||
int glwe_dimension, int polynomial_size, int level_count, int num_samples,
|
||||
uint32_t max_shared_memory) {
|
||||
int glwe_dimension, int polynomial_size, int level_count, int num_samples) {
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size<
|
||||
Torus, AmortizedDegree<256>>(glwe_dimension, level_count, num_samples,
|
||||
max_shared_memory);
|
||||
Torus, AmortizedDegree<256>>(glwe_dimension, level_count, num_samples);
|
||||
case 512:
|
||||
return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size<
|
||||
Torus, AmortizedDegree<512>>(glwe_dimension, level_count, num_samples,
|
||||
max_shared_memory);
|
||||
Torus, AmortizedDegree<512>>(glwe_dimension, level_count, num_samples);
|
||||
case 1024:
|
||||
return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size<
|
||||
Torus, AmortizedDegree<1024>>(glwe_dimension, level_count, num_samples,
|
||||
max_shared_memory);
|
||||
Torus, AmortizedDegree<1024>>(glwe_dimension, level_count, num_samples);
|
||||
case 2048:
|
||||
return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size<
|
||||
Torus, AmortizedDegree<2048>>(glwe_dimension, level_count, num_samples,
|
||||
max_shared_memory);
|
||||
Torus, AmortizedDegree<2048>>(glwe_dimension, level_count, num_samples);
|
||||
case 4096:
|
||||
return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size<
|
||||
Torus, AmortizedDegree<4096>>(glwe_dimension, level_count, num_samples,
|
||||
max_shared_memory);
|
||||
Torus, AmortizedDegree<4096>>(glwe_dimension, level_count, num_samples);
|
||||
case 8192:
|
||||
return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size<
|
||||
Torus, AmortizedDegree<8192>>(glwe_dimension, level_count, num_samples,
|
||||
max_shared_memory);
|
||||
Torus, AmortizedDegree<8192>>(glwe_dimension, level_count, num_samples);
|
||||
case 16384:
|
||||
return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size<
|
||||
Torus, AmortizedDegree<16384>>(glwe_dimension, level_count, num_samples,
|
||||
max_shared_memory);
|
||||
Torus, AmortizedDegree<16384>>(glwe_dimension, level_count,
|
||||
num_samples);
|
||||
default:
|
||||
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
|
||||
"N's are powers of two"
|
||||
|
||||
@@ -8,54 +8,46 @@ template <typename Torus>
|
||||
bool has_support_to_cuda_programmable_bootstrap_cg(uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size,
|
||||
uint32_t level_count,
|
||||
uint32_t num_samples,
|
||||
uint32_t max_shared_memory) {
|
||||
uint32_t num_samples) {
|
||||
return supports_cooperative_groups_on_programmable_bootstrap<Torus>(
|
||||
glwe_dimension, polynomial_size, level_count, num_samples,
|
||||
max_shared_memory);
|
||||
glwe_dimension, polynomial_size, level_count, num_samples);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
bool has_support_to_cuda_programmable_bootstrap_tbc(
|
||||
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t max_shared_memory) {
|
||||
bool has_support_to_cuda_programmable_bootstrap_tbc(uint32_t num_samples,
|
||||
uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size,
|
||||
uint32_t level_count) {
|
||||
#if CUDA_ARCH >= 900
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
return supports_thread_block_clusters_on_classic_programmable_bootstrap<
|
||||
Torus, AmortizedDegree<256>>(num_samples, glwe_dimension,
|
||||
polynomial_size, level_count,
|
||||
max_shared_memory);
|
||||
polynomial_size, level_count);
|
||||
case 512:
|
||||
return supports_thread_block_clusters_on_classic_programmable_bootstrap<
|
||||
Torus, AmortizedDegree<512>>(num_samples, glwe_dimension,
|
||||
polynomial_size, level_count,
|
||||
max_shared_memory);
|
||||
polynomial_size, level_count);
|
||||
case 1024:
|
||||
return supports_thread_block_clusters_on_classic_programmable_bootstrap<
|
||||
Torus, AmortizedDegree<1024>>(num_samples, glwe_dimension,
|
||||
polynomial_size, level_count,
|
||||
max_shared_memory);
|
||||
polynomial_size, level_count);
|
||||
case 2048:
|
||||
return supports_thread_block_clusters_on_classic_programmable_bootstrap<
|
||||
Torus, AmortizedDegree<2048>>(num_samples, glwe_dimension,
|
||||
polynomial_size, level_count,
|
||||
max_shared_memory);
|
||||
polynomial_size, level_count);
|
||||
case 4096:
|
||||
return supports_thread_block_clusters_on_classic_programmable_bootstrap<
|
||||
Torus, AmortizedDegree<4096>>(num_samples, glwe_dimension,
|
||||
polynomial_size, level_count,
|
||||
max_shared_memory);
|
||||
polynomial_size, level_count);
|
||||
case 8192:
|
||||
return supports_thread_block_clusters_on_classic_programmable_bootstrap<
|
||||
Torus, AmortizedDegree<8192>>(num_samples, glwe_dimension,
|
||||
polynomial_size, level_count,
|
||||
max_shared_memory);
|
||||
polynomial_size, level_count);
|
||||
case 16384:
|
||||
return supports_thread_block_clusters_on_classic_programmable_bootstrap<
|
||||
Torus, AmortizedDegree<16384>>(num_samples, glwe_dimension,
|
||||
polynomial_size, level_count,
|
||||
max_shared_memory);
|
||||
polynomial_size, level_count);
|
||||
default:
|
||||
PANIC("Cuda error (classical PBS): unsupported polynomial size. Supported "
|
||||
"N's are powers of two"
|
||||
@@ -67,55 +59,54 @@ bool has_support_to_cuda_programmable_bootstrap_tbc(
|
||||
}
|
||||
|
||||
#if (CUDA_ARCH >= 900)
|
||||
template <typename Torus, typename STorus>
|
||||
template <typename Torus>
|
||||
void scratch_cuda_programmable_bootstrap_tbc(
|
||||
void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **pbs_buffer,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory) {
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
scratch_programmable_bootstrap_tbc<Torus, STorus, AmortizedDegree<256>>(
|
||||
scratch_programmable_bootstrap_tbc<Torus, AmortizedDegree<256>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory);
|
||||
break;
|
||||
case 512:
|
||||
scratch_programmable_bootstrap_tbc<Torus, STorus, AmortizedDegree<512>>(
|
||||
scratch_programmable_bootstrap_tbc<Torus, AmortizedDegree<512>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory);
|
||||
break;
|
||||
case 1024:
|
||||
scratch_programmable_bootstrap_tbc<Torus, STorus, AmortizedDegree<1024>>(
|
||||
scratch_programmable_bootstrap_tbc<Torus, AmortizedDegree<1024>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory);
|
||||
break;
|
||||
case 2048:
|
||||
scratch_programmable_bootstrap_tbc<Torus, STorus, AmortizedDegree<2048>>(
|
||||
scratch_programmable_bootstrap_tbc<Torus, AmortizedDegree<2048>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory);
|
||||
break;
|
||||
case 4096:
|
||||
scratch_programmable_bootstrap_tbc<Torus, STorus, AmortizedDegree<4096>>(
|
||||
scratch_programmable_bootstrap_tbc<Torus, AmortizedDegree<4096>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory);
|
||||
break;
|
||||
case 8192:
|
||||
scratch_programmable_bootstrap_tbc<Torus, STorus, AmortizedDegree<8192>>(
|
||||
scratch_programmable_bootstrap_tbc<Torus, AmortizedDegree<8192>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory);
|
||||
break;
|
||||
case 16384:
|
||||
scratch_programmable_bootstrap_tbc<Torus, STorus, AmortizedDegree<16384>>(
|
||||
scratch_programmable_bootstrap_tbc<Torus, AmortizedDegree<16384>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (classical PBS): unsupported polynomial size. "
|
||||
@@ -131,8 +122,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
|
||||
Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
|
||||
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
|
||||
uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t gpu_offset) {
|
||||
uint32_t level_count, uint32_t num_samples) {
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
@@ -140,56 +130,49 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
|
||||
num_luts, max_shared_memory, gpu_offset);
|
||||
lwe_dimension, polynomial_size, base_log, level_count, num_samples);
|
||||
break;
|
||||
case 512:
|
||||
host_programmable_bootstrap_tbc<Torus, Degree<512>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
|
||||
num_luts, max_shared_memory, gpu_offset);
|
||||
lwe_dimension, polynomial_size, base_log, level_count, num_samples);
|
||||
break;
|
||||
case 1024:
|
||||
host_programmable_bootstrap_tbc<Torus, Degree<1024>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
|
||||
num_luts, max_shared_memory, gpu_offset);
|
||||
lwe_dimension, polynomial_size, base_log, level_count, num_samples);
|
||||
break;
|
||||
case 2048:
|
||||
host_programmable_bootstrap_tbc<Torus, AmortizedDegree<2048>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
|
||||
num_luts, max_shared_memory, gpu_offset);
|
||||
lwe_dimension, polynomial_size, base_log, level_count, num_samples);
|
||||
break;
|
||||
case 4096:
|
||||
host_programmable_bootstrap_tbc<Torus, AmortizedDegree<4096>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
|
||||
num_luts, max_shared_memory, gpu_offset);
|
||||
lwe_dimension, polynomial_size, base_log, level_count, num_samples);
|
||||
break;
|
||||
case 8192:
|
||||
host_programmable_bootstrap_tbc<Torus, AmortizedDegree<8192>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
|
||||
num_luts, max_shared_memory, gpu_offset);
|
||||
lwe_dimension, polynomial_size, base_log, level_count, num_samples);
|
||||
break;
|
||||
case 16384:
|
||||
host_programmable_bootstrap_tbc<Torus, AmortizedDegree<16384>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
|
||||
num_luts, max_shared_memory, gpu_offset);
|
||||
lwe_dimension, polynomial_size, base_log, level_count, num_samples);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (classical PBS): unsupported polynomial size. "
|
||||
@@ -204,69 +187,68 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
|
||||
*/
|
||||
uint64_t get_buffer_size_programmable_bootstrap_64(
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {
|
||||
uint32_t input_lwe_ciphertext_count) {
|
||||
|
||||
if (has_support_to_cuda_programmable_bootstrap_cg<uint64_t>(
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, max_shared_memory))
|
||||
input_lwe_ciphertext_count))
|
||||
return get_buffer_size_programmable_bootstrap_cg<uint64_t>(
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, max_shared_memory);
|
||||
input_lwe_ciphertext_count);
|
||||
else
|
||||
return get_buffer_size_programmable_bootstrap_cg<uint64_t>(
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, max_shared_memory);
|
||||
input_lwe_ciphertext_count);
|
||||
}
|
||||
|
||||
template <typename Torus, typename STorus>
|
||||
template <typename Torus>
|
||||
void scratch_cuda_programmable_bootstrap_cg(
|
||||
void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **pbs_buffer,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory) {
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
scratch_programmable_bootstrap_cg<Torus, STorus, AmortizedDegree<256>>(
|
||||
scratch_programmable_bootstrap_cg<Torus, AmortizedDegree<256>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory);
|
||||
break;
|
||||
case 512:
|
||||
scratch_programmable_bootstrap_cg<Torus, STorus, AmortizedDegree<512>>(
|
||||
scratch_programmable_bootstrap_cg<Torus, AmortizedDegree<512>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory);
|
||||
break;
|
||||
case 1024:
|
||||
scratch_programmable_bootstrap_cg<Torus, STorus, AmortizedDegree<1024>>(
|
||||
scratch_programmable_bootstrap_cg<Torus, AmortizedDegree<1024>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory);
|
||||
break;
|
||||
case 2048:
|
||||
scratch_programmable_bootstrap_cg<Torus, STorus, AmortizedDegree<2048>>(
|
||||
scratch_programmable_bootstrap_cg<Torus, AmortizedDegree<2048>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory);
|
||||
break;
|
||||
case 4096:
|
||||
scratch_programmable_bootstrap_cg<Torus, STorus, AmortizedDegree<4096>>(
|
||||
scratch_programmable_bootstrap_cg<Torus, AmortizedDegree<4096>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory);
|
||||
break;
|
||||
case 8192:
|
||||
scratch_programmable_bootstrap_cg<Torus, STorus, AmortizedDegree<8192>>(
|
||||
scratch_programmable_bootstrap_cg<Torus, AmortizedDegree<8192>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory);
|
||||
break;
|
||||
case 16384:
|
||||
scratch_programmable_bootstrap_cg<Torus, STorus, AmortizedDegree<16384>>(
|
||||
scratch_programmable_bootstrap_cg<Torus, AmortizedDegree<16384>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (classical PBS): unsupported polynomial size. "
|
||||
@@ -275,55 +257,54 @@ void scratch_cuda_programmable_bootstrap_cg(
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus, typename STorus>
|
||||
template <typename Torus>
|
||||
void scratch_cuda_programmable_bootstrap(
|
||||
void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **buffer,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory) {
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
scratch_programmable_bootstrap<Torus, STorus, AmortizedDegree<256>>(
|
||||
scratch_programmable_bootstrap<Torus, AmortizedDegree<256>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
|
||||
polynomial_size, level_count, input_lwe_ciphertext_count,
|
||||
max_shared_memory, allocate_gpu_memory);
|
||||
allocate_gpu_memory);
|
||||
break;
|
||||
case 512:
|
||||
scratch_programmable_bootstrap<Torus, STorus, AmortizedDegree<512>>(
|
||||
scratch_programmable_bootstrap<Torus, AmortizedDegree<512>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
|
||||
polynomial_size, level_count, input_lwe_ciphertext_count,
|
||||
max_shared_memory, allocate_gpu_memory);
|
||||
allocate_gpu_memory);
|
||||
break;
|
||||
case 1024:
|
||||
scratch_programmable_bootstrap<Torus, STorus, AmortizedDegree<1024>>(
|
||||
scratch_programmable_bootstrap<Torus, AmortizedDegree<1024>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
|
||||
polynomial_size, level_count, input_lwe_ciphertext_count,
|
||||
max_shared_memory, allocate_gpu_memory);
|
||||
allocate_gpu_memory);
|
||||
break;
|
||||
case 2048:
|
||||
scratch_programmable_bootstrap<Torus, STorus, AmortizedDegree<2048>>(
|
||||
scratch_programmable_bootstrap<Torus, AmortizedDegree<2048>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
|
||||
polynomial_size, level_count, input_lwe_ciphertext_count,
|
||||
max_shared_memory, allocate_gpu_memory);
|
||||
allocate_gpu_memory);
|
||||
break;
|
||||
case 4096:
|
||||
scratch_programmable_bootstrap<Torus, STorus, AmortizedDegree<4096>>(
|
||||
scratch_programmable_bootstrap<Torus, AmortizedDegree<4096>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
|
||||
polynomial_size, level_count, input_lwe_ciphertext_count,
|
||||
max_shared_memory, allocate_gpu_memory);
|
||||
allocate_gpu_memory);
|
||||
break;
|
||||
case 8192:
|
||||
scratch_programmable_bootstrap<Torus, STorus, AmortizedDegree<8192>>(
|
||||
scratch_programmable_bootstrap<Torus, AmortizedDegree<8192>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
|
||||
polynomial_size, level_count, input_lwe_ciphertext_count,
|
||||
max_shared_memory, allocate_gpu_memory);
|
||||
allocate_gpu_memory);
|
||||
break;
|
||||
case 16384:
|
||||
scratch_programmable_bootstrap<Torus, STorus, AmortizedDegree<16384>>(
|
||||
scratch_programmable_bootstrap<Torus, AmortizedDegree<16384>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
|
||||
polynomial_size, level_count, input_lwe_ciphertext_count,
|
||||
max_shared_memory, allocate_gpu_memory);
|
||||
allocate_gpu_memory);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (classical PBS): unsupported polynomial size. "
|
||||
@@ -341,31 +322,30 @@ void scratch_cuda_programmable_bootstrap(
|
||||
void scratch_cuda_programmable_bootstrap_32(
|
||||
void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory) {
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
|
||||
|
||||
#if (CUDA_ARCH >= 900)
|
||||
if (has_support_to_cuda_programmable_bootstrap_tbc<uint32_t>(
|
||||
input_lwe_ciphertext_count, glwe_dimension, polynomial_size,
|
||||
level_count, max_shared_memory))
|
||||
scratch_cuda_programmable_bootstrap_tbc<uint32_t, int32_t>(
|
||||
level_count))
|
||||
scratch_cuda_programmable_bootstrap_tbc<uint32_t>(
|
||||
stream, gpu_index, (pbs_buffer<uint32_t, CLASSICAL> **)buffer,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory);
|
||||
else
|
||||
#endif
|
||||
if (has_support_to_cuda_programmable_bootstrap_cg<uint32_t>(
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, max_shared_memory))
|
||||
scratch_cuda_programmable_bootstrap_cg<uint32_t, int32_t>(
|
||||
input_lwe_ciphertext_count))
|
||||
scratch_cuda_programmable_bootstrap_cg<uint32_t>(
|
||||
stream, gpu_index, (pbs_buffer<uint32_t, CLASSICAL> **)buffer,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory);
|
||||
else
|
||||
scratch_cuda_programmable_bootstrap<uint32_t, int32_t>(
|
||||
scratch_cuda_programmable_bootstrap<uint32_t>(
|
||||
stream, gpu_index, (pbs_buffer<uint32_t, CLASSICAL> **)buffer,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -376,31 +356,30 @@ void scratch_cuda_programmable_bootstrap_32(
|
||||
void scratch_cuda_programmable_bootstrap_64(
|
||||
void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory) {
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
|
||||
|
||||
#if (CUDA_ARCH >= 900)
|
||||
if (has_support_to_cuda_programmable_bootstrap_tbc<uint64_t>(
|
||||
input_lwe_ciphertext_count, glwe_dimension, polynomial_size,
|
||||
level_count, max_shared_memory))
|
||||
scratch_cuda_programmable_bootstrap_tbc<uint64_t, int64_t>(
|
||||
level_count))
|
||||
scratch_cuda_programmable_bootstrap_tbc<uint64_t>(
|
||||
stream, gpu_index, (pbs_buffer<uint64_t, CLASSICAL> **)buffer,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory);
|
||||
else
|
||||
#endif
|
||||
if (has_support_to_cuda_programmable_bootstrap_cg<uint64_t>(
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, max_shared_memory))
|
||||
scratch_cuda_programmable_bootstrap_cg<uint64_t, int64_t>(
|
||||
input_lwe_ciphertext_count))
|
||||
scratch_cuda_programmable_bootstrap_cg<uint64_t>(
|
||||
stream, gpu_index, (pbs_buffer<uint64_t, CLASSICAL> **)buffer,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory);
|
||||
else
|
||||
scratch_cuda_programmable_bootstrap<uint64_t, int64_t>(
|
||||
scratch_cuda_programmable_bootstrap<uint64_t>(
|
||||
stream, gpu_index, (pbs_buffer<uint64_t, CLASSICAL> **)buffer,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
@@ -410,8 +389,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
|
||||
Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
|
||||
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
|
||||
uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t gpu_offset) {
|
||||
uint32_t level_count, uint32_t num_samples) {
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
@@ -419,56 +397,49 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
|
||||
num_luts, max_shared_memory, gpu_offset);
|
||||
lwe_dimension, polynomial_size, base_log, level_count, num_samples);
|
||||
break;
|
||||
case 512:
|
||||
host_programmable_bootstrap_cg<Torus, Degree<512>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
|
||||
num_luts, max_shared_memory, gpu_offset);
|
||||
lwe_dimension, polynomial_size, base_log, level_count, num_samples);
|
||||
break;
|
||||
case 1024:
|
||||
host_programmable_bootstrap_cg<Torus, Degree<1024>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
|
||||
num_luts, max_shared_memory, gpu_offset);
|
||||
lwe_dimension, polynomial_size, base_log, level_count, num_samples);
|
||||
break;
|
||||
case 2048:
|
||||
host_programmable_bootstrap_cg<Torus, AmortizedDegree<2048>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
|
||||
num_luts, max_shared_memory, gpu_offset);
|
||||
lwe_dimension, polynomial_size, base_log, level_count, num_samples);
|
||||
break;
|
||||
case 4096:
|
||||
host_programmable_bootstrap_cg<Torus, AmortizedDegree<4096>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
|
||||
num_luts, max_shared_memory, gpu_offset);
|
||||
lwe_dimension, polynomial_size, base_log, level_count, num_samples);
|
||||
break;
|
||||
case 8192:
|
||||
host_programmable_bootstrap_cg<Torus, AmortizedDegree<8192>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
|
||||
num_luts, max_shared_memory, gpu_offset);
|
||||
lwe_dimension, polynomial_size, base_log, level_count, num_samples);
|
||||
break;
|
||||
case 16384:
|
||||
host_programmable_bootstrap_cg<Torus, AmortizedDegree<16384>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
|
||||
num_luts, max_shared_memory, gpu_offset);
|
||||
lwe_dimension, polynomial_size, base_log, level_count, num_samples);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (classical PBS): unsupported polynomial size. "
|
||||
@@ -484,8 +455,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
|
||||
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
|
||||
uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t gpu_offset) {
|
||||
uint32_t level_count, uint32_t num_samples) {
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
@@ -493,56 +463,49 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
|
||||
num_luts, max_shared_memory, gpu_offset);
|
||||
lwe_dimension, polynomial_size, base_log, level_count, num_samples);
|
||||
break;
|
||||
case 512:
|
||||
host_programmable_bootstrap<Torus, Degree<512>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
|
||||
num_luts, max_shared_memory, gpu_offset);
|
||||
lwe_dimension, polynomial_size, base_log, level_count, num_samples);
|
||||
break;
|
||||
case 1024:
|
||||
host_programmable_bootstrap<Torus, Degree<1024>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
|
||||
num_luts, max_shared_memory, gpu_offset);
|
||||
lwe_dimension, polynomial_size, base_log, level_count, num_samples);
|
||||
break;
|
||||
case 2048:
|
||||
host_programmable_bootstrap<Torus, AmortizedDegree<2048>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
|
||||
num_luts, max_shared_memory, gpu_offset);
|
||||
lwe_dimension, polynomial_size, base_log, level_count, num_samples);
|
||||
break;
|
||||
case 4096:
|
||||
host_programmable_bootstrap<Torus, AmortizedDegree<4096>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
|
||||
num_luts, max_shared_memory, gpu_offset);
|
||||
lwe_dimension, polynomial_size, base_log, level_count, num_samples);
|
||||
break;
|
||||
case 8192:
|
||||
host_programmable_bootstrap<Torus, AmortizedDegree<8192>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
|
||||
num_luts, max_shared_memory, gpu_offset);
|
||||
lwe_dimension, polynomial_size, base_log, level_count, num_samples);
|
||||
break;
|
||||
case 16384:
|
||||
host_programmable_bootstrap<Torus, AmortizedDegree<16384>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
|
||||
num_luts, max_shared_memory, gpu_offset);
|
||||
lwe_dimension, polynomial_size, base_log, level_count, num_samples);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (classical PBS): unsupported polynomial size. "
|
||||
@@ -559,15 +522,14 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
|
||||
void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
|
||||
int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
|
||||
uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
|
||||
uint32_t max_shared_memory, uint32_t gpu_offset) {
|
||||
uint32_t num_samples) {
|
||||
|
||||
if (base_log > 32)
|
||||
PANIC("Cuda error (classical PBS): base log should be > number of bits "
|
||||
"in the ciphertext representation (32)");
|
||||
|
||||
pbs_buffer<uint64_t, CLASSICAL> *buffer =
|
||||
(pbs_buffer<uint64_t, CLASSICAL> *)mem_ptr;
|
||||
pbs_buffer<uint32_t, CLASSICAL> *buffer =
|
||||
(pbs_buffer<uint32_t, CLASSICAL> *)mem_ptr;
|
||||
|
||||
switch (buffer->pbs_variant) {
|
||||
case TBC:
|
||||
@@ -579,14 +541,12 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
|
||||
static_cast<uint32_t *>(lut_vector_indexes),
|
||||
static_cast<uint32_t *>(lwe_array_in),
|
||||
static_cast<uint32_t *>(lwe_input_indexes),
|
||||
static_cast<double2 *>(bootstrapping_key),
|
||||
(pbs_buffer<uint32_t, CLASSICAL> *)buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, base_log, level_count, num_samples,
|
||||
num_luts, lwe_idx, max_shared_memory, gpu_offset);
|
||||
static_cast<double2 *>(bootstrapping_key), buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, base_log, level_count, num_samples);
|
||||
break;
|
||||
#else
|
||||
PANIC("Cuda error (PBS): TBC pbs is not supported.")
|
||||
#endif
|
||||
break;
|
||||
case CG:
|
||||
cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint32_t>(
|
||||
stream, gpu_index, static_cast<uint32_t *>(lwe_array_out),
|
||||
@@ -595,10 +555,8 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
|
||||
static_cast<uint32_t *>(lut_vector_indexes),
|
||||
static_cast<uint32_t *>(lwe_array_in),
|
||||
static_cast<uint32_t *>(lwe_input_indexes),
|
||||
static_cast<double2 *>(bootstrapping_key),
|
||||
(pbs_buffer<uint32_t, CLASSICAL> *)buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, base_log, level_count, num_samples,
|
||||
num_luts, lwe_idx, max_shared_memory, gpu_offset);
|
||||
static_cast<double2 *>(bootstrapping_key), buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, base_log, level_count, num_samples);
|
||||
break;
|
||||
case DEFAULT:
|
||||
cuda_programmable_bootstrap_lwe_ciphertext_vector<uint32_t>(
|
||||
@@ -608,10 +566,8 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
|
||||
static_cast<uint32_t *>(lut_vector_indexes),
|
||||
static_cast<uint32_t *>(lwe_array_in),
|
||||
static_cast<uint32_t *>(lwe_input_indexes),
|
||||
static_cast<double2 *>(bootstrapping_key),
|
||||
(pbs_buffer<uint32_t, CLASSICAL> *)buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, base_log, level_count, num_samples,
|
||||
num_luts, lwe_idx, max_shared_memory, gpu_offset);
|
||||
static_cast<double2 *>(bootstrapping_key), buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, base_log, level_count, num_samples);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (PBS): unknown pbs variant.")
|
||||
@@ -654,17 +610,6 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
|
||||
* - base_log: log base used for the gadget matrix - B = 2^base_log (~8)
|
||||
* - level_count: number of decomposition levels in the gadget matrix (~4)
|
||||
* - num_samples: number of encrypted input messages
|
||||
* - num_luts: parameter to set the actual number of luts to be
|
||||
* used
|
||||
* - lwe_idx: the index of the LWE input to consider for the GPU of index
|
||||
* gpu_index. In case of multi-GPU computing, it is assumed that only a part of
|
||||
* the input LWE array is copied to each GPU, but the whole LUT array is copied
|
||||
* (because the case when the number of LUTs is smaller than the number of input
|
||||
* LWEs is not trivial to take into account in the data repartition on the
|
||||
* GPUs). `lwe_idx` is used to determine which LUT to consider for a given LWE
|
||||
* input in the LUT array `lut_vector`.
|
||||
* - 'max_shared_memory' maximum amount of shared memory to be used inside
|
||||
* device functions
|
||||
*
|
||||
* This function calls a wrapper to a device kernel that performs the
|
||||
* bootstrapping:
|
||||
@@ -696,8 +641,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
|
||||
void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
|
||||
int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
|
||||
uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
|
||||
uint32_t max_shared_memory, uint32_t gpu_offset) {
|
||||
uint32_t num_samples) {
|
||||
if (base_log > 64)
|
||||
PANIC("Cuda error (classical PBS): base log should be > number of bits "
|
||||
"in the ciphertext representation (64)");
|
||||
@@ -715,14 +659,12 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
|
||||
static_cast<uint64_t *>(lut_vector_indexes),
|
||||
static_cast<uint64_t *>(lwe_array_in),
|
||||
static_cast<uint64_t *>(lwe_input_indexes),
|
||||
static_cast<double2 *>(bootstrapping_key),
|
||||
(pbs_buffer<uint64_t, CLASSICAL> *)buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, base_log, level_count, num_samples,
|
||||
num_luts, lwe_idx, max_shared_memory, gpu_offset);
|
||||
static_cast<double2 *>(bootstrapping_key), buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, base_log, level_count, num_samples);
|
||||
break;
|
||||
#else
|
||||
PANIC("Cuda error (PBS): TBC pbs is not supported.")
|
||||
#endif
|
||||
break;
|
||||
case PBS_VARIANT::CG:
|
||||
cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint64_t>(
|
||||
stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
|
||||
@@ -731,10 +673,8 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
|
||||
static_cast<uint64_t *>(lut_vector_indexes),
|
||||
static_cast<uint64_t *>(lwe_array_in),
|
||||
static_cast<uint64_t *>(lwe_input_indexes),
|
||||
static_cast<double2 *>(bootstrapping_key),
|
||||
(pbs_buffer<uint64_t, CLASSICAL> *)buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, base_log, level_count, num_samples,
|
||||
num_luts, lwe_idx, max_shared_memory, gpu_offset);
|
||||
static_cast<double2 *>(bootstrapping_key), buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, base_log, level_count, num_samples);
|
||||
break;
|
||||
case PBS_VARIANT::DEFAULT:
|
||||
cuda_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
|
||||
@@ -744,10 +684,8 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
|
||||
static_cast<uint64_t *>(lut_vector_indexes),
|
||||
static_cast<uint64_t *>(lwe_array_in),
|
||||
static_cast<uint64_t *>(lwe_input_indexes),
|
||||
static_cast<double2 *>(bootstrapping_key),
|
||||
(pbs_buffer<uint64_t, CLASSICAL> *)buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, base_log, level_count, num_samples,
|
||||
num_luts, lwe_idx, max_shared_memory, gpu_offset);
|
||||
static_cast<double2 *>(bootstrapping_key), buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, base_log, level_count, num_samples);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (PBS): unknown pbs variant.")
|
||||
@@ -760,14 +698,13 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
|
||||
*/
|
||||
void cleanup_cuda_programmable_bootstrap(void *stream, uint32_t gpu_index,
|
||||
int8_t **buffer) {
|
||||
cudaSetDevice(gpu_index);
|
||||
auto x = (pbs_buffer<uint64_t, CLASSICAL> *)(*buffer);
|
||||
x->release(static_cast<cudaStream_t>(stream), gpu_index);
|
||||
}
|
||||
|
||||
template bool has_support_to_cuda_programmable_bootstrap_cg<uint64_t>(
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t num_samples, uint32_t max_shared_memory);
|
||||
uint32_t num_samples);
|
||||
|
||||
template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint64_t>(
|
||||
void *stream, uint32_t gpu_index, uint64_t *lwe_array_out,
|
||||
@@ -776,8 +713,7 @@ template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint64_t>(
|
||||
uint64_t *lwe_input_indexes, double2 *bootstrapping_key,
|
||||
pbs_buffer<uint64_t, CLASSICAL> *pbs_buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
|
||||
uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t gpu_offset);
|
||||
uint32_t level_count, uint32_t num_samples);
|
||||
|
||||
template void cuda_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
|
||||
void *stream, uint32_t gpu_index, uint64_t *lwe_array_out,
|
||||
@@ -786,21 +722,18 @@ template void cuda_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
|
||||
uint64_t *lwe_input_indexes, double2 *bootstrapping_key,
|
||||
pbs_buffer<uint64_t, CLASSICAL> *pbs_buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
|
||||
uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t gpu_offset);
|
||||
uint32_t level_count, uint32_t num_samples);
|
||||
|
||||
template void scratch_cuda_programmable_bootstrap_cg<uint64_t, int64_t>(
|
||||
template void scratch_cuda_programmable_bootstrap_cg<uint64_t>(
|
||||
void *stream, uint32_t gpu_index,
|
||||
pbs_buffer<uint64_t, CLASSICAL> **pbs_buffer, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory);
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
|
||||
|
||||
template void scratch_cuda_programmable_bootstrap<uint64_t, int64_t>(
|
||||
template void scratch_cuda_programmable_bootstrap<uint64_t>(
|
||||
void *stream, uint32_t gpu_index, pbs_buffer<uint64_t, CLASSICAL> **buffer,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory);
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
|
||||
|
||||
template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint32_t>(
|
||||
void *stream, uint32_t gpu_index, uint32_t *lwe_array_out,
|
||||
@@ -809,8 +742,7 @@ template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint32_t>(
|
||||
uint32_t *lwe_input_indexes, double2 *bootstrapping_key,
|
||||
pbs_buffer<uint32_t, CLASSICAL> *pbs_buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
|
||||
uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t gpu_offset);
|
||||
uint32_t level_count, uint32_t num_samples);
|
||||
|
||||
template void cuda_programmable_bootstrap_lwe_ciphertext_vector<uint32_t>(
|
||||
void *stream, uint32_t gpu_index, uint32_t *lwe_array_out,
|
||||
@@ -819,28 +751,25 @@ template void cuda_programmable_bootstrap_lwe_ciphertext_vector<uint32_t>(
|
||||
uint32_t *lwe_input_indexes, double2 *bootstrapping_key,
|
||||
pbs_buffer<uint32_t, CLASSICAL> *pbs_buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
|
||||
uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t gpu_offset);
|
||||
uint32_t level_count, uint32_t num_samples);
|
||||
|
||||
template void scratch_cuda_programmable_bootstrap_cg<uint32_t, int32_t>(
|
||||
template void scratch_cuda_programmable_bootstrap_cg<uint32_t>(
|
||||
void *stream, uint32_t gpu_index,
|
||||
pbs_buffer<uint32_t, CLASSICAL> **pbs_buffer, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory);
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
|
||||
|
||||
template void scratch_cuda_programmable_bootstrap<uint32_t, int32_t>(
|
||||
template void scratch_cuda_programmable_bootstrap<uint32_t>(
|
||||
void *stream, uint32_t gpu_index, pbs_buffer<uint32_t, CLASSICAL> **buffer,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory);
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
|
||||
|
||||
template bool has_support_to_cuda_programmable_bootstrap_tbc<uint32_t>(
|
||||
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t max_shared_memory);
|
||||
uint32_t level_count);
|
||||
template bool has_support_to_cuda_programmable_bootstrap_tbc<uint64_t>(
|
||||
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t max_shared_memory);
|
||||
uint32_t level_count);
|
||||
|
||||
#if CUDA_ARCH >= 900
|
||||
template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint32_t>(
|
||||
@@ -850,8 +779,7 @@ template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint32_t>(
|
||||
uint32_t *lwe_input_indexes, double2 *bootstrapping_key,
|
||||
pbs_buffer<uint32_t, CLASSICAL> *buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
|
||||
uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t gpu_offset);
|
||||
uint32_t level_count, uint32_t num_samples);
|
||||
template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint64_t>(
|
||||
void *stream, uint32_t gpu_index, uint64_t *lwe_array_out,
|
||||
uint64_t *lwe_output_indexes, uint64_t *lut_vector,
|
||||
@@ -859,18 +787,15 @@ template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint64_t>(
|
||||
uint64_t *lwe_input_indexes, double2 *bootstrapping_key,
|
||||
pbs_buffer<uint64_t, CLASSICAL> *buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
|
||||
uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t gpu_offset);
|
||||
template void scratch_cuda_programmable_bootstrap_tbc<uint32_t, int32_t>(
|
||||
uint32_t level_count, uint32_t num_samples);
|
||||
template void scratch_cuda_programmable_bootstrap_tbc<uint32_t>(
|
||||
void *stream, uint32_t gpu_index,
|
||||
pbs_buffer<uint32_t, CLASSICAL> **pbs_buffer, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory);
|
||||
template void scratch_cuda_programmable_bootstrap_tbc<uint64_t, int64_t>(
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
|
||||
template void scratch_cuda_programmable_bootstrap_tbc<uint64_t>(
|
||||
void *stream, uint32_t gpu_index,
|
||||
pbs_buffer<uint64_t, CLASSICAL> **pbs_buffer, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory);
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
|
||||
#endif
|
||||
|
||||
@@ -17,13 +17,17 @@
|
||||
#include "types/complex/operations.cuh"
|
||||
|
||||
template <typename Torus, class params, sharedMemDegree SMD>
|
||||
__global__ void device_programmable_bootstrap_step_one(
|
||||
Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
|
||||
Torus *lwe_input_indexes, double2 *bootstrapping_key,
|
||||
Torus *global_accumulator, double2 *global_accumulator_fft,
|
||||
uint32_t lwe_iteration, uint32_t lwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t base_log, uint32_t level_count, int8_t *device_mem,
|
||||
uint64_t device_memory_size_per_block, uint32_t gpu_offset) {
|
||||
__global__ void __launch_bounds__(params::degree / params::opt)
|
||||
device_programmable_bootstrap_step_one(
|
||||
const Torus *__restrict__ lut_vector,
|
||||
const Torus *__restrict__ lut_vector_indexes,
|
||||
const Torus *__restrict__ lwe_array_in,
|
||||
const Torus *__restrict__ lwe_input_indexes,
|
||||
const double2 *__restrict__ bootstrapping_key,
|
||||
Torus *global_accumulator, double2 *global_accumulator_fft,
|
||||
uint32_t lwe_iteration, uint32_t lwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
|
||||
int8_t *device_mem, uint64_t device_memory_size_per_block) {
|
||||
|
||||
// We use shared memory for the polynomials that are used often during the
|
||||
// bootstrap, since shared memory is kept in L1 cache and accessing it is
|
||||
@@ -50,12 +54,12 @@ __global__ void device_programmable_bootstrap_step_one(
|
||||
|
||||
// The third dimension of the block is used to determine on which ciphertext
|
||||
// this block is operating, in the case of batch bootstraps
|
||||
Torus *block_lwe_array_in =
|
||||
&lwe_array_in[lwe_input_indexes[blockIdx.z + gpu_offset] *
|
||||
(lwe_dimension + 1)];
|
||||
const Torus *block_lwe_array_in =
|
||||
&lwe_array_in[lwe_input_indexes[blockIdx.z] * (lwe_dimension + 1)];
|
||||
|
||||
Torus *block_lut_vector = &lut_vector[lut_vector_indexes[blockIdx.z] *
|
||||
params::degree * (glwe_dimension + 1)];
|
||||
const Torus *block_lut_vector =
|
||||
&lut_vector[lut_vector_indexes[blockIdx.z] * params::degree *
|
||||
(glwe_dimension + 1)];
|
||||
|
||||
Torus *global_slice =
|
||||
global_accumulator +
|
||||
@@ -71,8 +75,8 @@ __global__ void device_programmable_bootstrap_step_one(
|
||||
// First iteration
|
||||
// Put "b" in [0, 2N[
|
||||
Torus b_hat = 0;
|
||||
rescale_torus_element(block_lwe_array_in[lwe_dimension], b_hat,
|
||||
2 * params::degree);
|
||||
modulus_switch(block_lwe_array_in[lwe_dimension], b_hat,
|
||||
params::log2_degree + 1);
|
||||
// The y-dimension is used to select the element of the GLWE this block will
|
||||
// compute
|
||||
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
|
||||
@@ -90,8 +94,8 @@ __global__ void device_programmable_bootstrap_step_one(
|
||||
|
||||
// Put "a" in [0, 2N[
|
||||
Torus a_hat = 0;
|
||||
rescale_torus_element(block_lwe_array_in[lwe_iteration], a_hat,
|
||||
2 * params::degree); // 2 * params::log2_degree + 1);
|
||||
modulus_switch(block_lwe_array_in[lwe_iteration], a_hat,
|
||||
params::log2_degree + 1); // 2 * params::log2_degree + 1);
|
||||
|
||||
synchronize_threads_in_block();
|
||||
|
||||
@@ -128,13 +132,16 @@ __global__ void device_programmable_bootstrap_step_one(
|
||||
}
|
||||
|
||||
template <typename Torus, class params, sharedMemDegree SMD>
|
||||
__global__ void device_programmable_bootstrap_step_two(
|
||||
Torus *lwe_array_out, Torus *lwe_output_indexes, Torus *lut_vector,
|
||||
Torus *lut_vector_indexes, double2 *bootstrapping_key,
|
||||
Torus *global_accumulator, double2 *global_accumulator_fft,
|
||||
uint32_t lwe_iteration, uint32_t lwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t base_log, uint32_t level_count, int8_t *device_mem,
|
||||
uint64_t device_memory_size_per_block, uint32_t gpu_offset) {
|
||||
__global__ void __launch_bounds__(params::degree / params::opt)
|
||||
device_programmable_bootstrap_step_two(
|
||||
Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
|
||||
const Torus *__restrict__ lut_vector,
|
||||
const Torus *__restrict__ lut_vector_indexes,
|
||||
const double2 *__restrict__ bootstrapping_key,
|
||||
Torus *global_accumulator, double2 *global_accumulator_fft,
|
||||
uint32_t lwe_iteration, uint32_t lwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
|
||||
int8_t *device_mem, uint64_t device_memory_size_per_block) {
|
||||
|
||||
// We use shared memory for the polynomials that are used often during the
|
||||
// bootstrap, since shared memory is kept in L1 cache and accessing it is
|
||||
@@ -200,7 +207,7 @@ __global__ void device_programmable_bootstrap_step_two(
|
||||
if (lwe_iteration + 1 == lwe_dimension) {
|
||||
// Last iteration
|
||||
auto block_lwe_array_out =
|
||||
&lwe_array_out[lwe_output_indexes[blockIdx.x + gpu_offset] *
|
||||
&lwe_array_out[lwe_output_indexes[blockIdx.x] *
|
||||
(glwe_dimension * polynomial_size + 1) +
|
||||
blockIdx.y * polynomial_size];
|
||||
|
||||
@@ -223,9 +230,9 @@ __global__ void device_programmable_bootstrap_step_two(
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ __device__ uint64_t get_buffer_size_programmable_bootstrap(
|
||||
uint64_t get_buffer_size_programmable_bootstrap(
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {
|
||||
uint32_t input_lwe_ciphertext_count) {
|
||||
|
||||
uint64_t full_sm_step_one =
|
||||
get_buffer_size_full_sm_programmable_bootstrap_step_one<Torus>(
|
||||
@@ -241,6 +248,7 @@ __host__ __device__ uint64_t get_buffer_size_programmable_bootstrap(
|
||||
uint64_t full_dm = full_sm_step_one;
|
||||
|
||||
uint64_t device_mem = 0;
|
||||
int max_shared_memory = cuda_get_max_shared_memory(0);
|
||||
if (max_shared_memory < partial_sm) {
|
||||
device_mem = full_dm * input_lwe_ciphertext_count * level_count *
|
||||
(glwe_dimension + 1);
|
||||
@@ -263,15 +271,13 @@ __host__ __device__ uint64_t get_buffer_size_programmable_bootstrap(
|
||||
return buffer_size + buffer_size % sizeof(double2);
|
||||
}
|
||||
|
||||
template <typename Torus, typename STorus, typename params>
|
||||
template <typename Torus, typename params>
|
||||
__host__ void scratch_programmable_bootstrap(
|
||||
cudaStream_t stream, uint32_t gpu_index,
|
||||
pbs_buffer<Torus, CLASSICAL> **buffer, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory) {
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
|
||||
|
||||
cudaSetDevice(gpu_index);
|
||||
uint64_t full_sm_step_one =
|
||||
get_buffer_size_full_sm_programmable_bootstrap_step_one<Torus>(
|
||||
polynomial_size);
|
||||
@@ -281,6 +287,8 @@ __host__ void scratch_programmable_bootstrap(
|
||||
uint64_t partial_sm =
|
||||
get_buffer_size_partial_sm_programmable_bootstrap<Torus>(polynomial_size);
|
||||
|
||||
int max_shared_memory = cuda_get_max_shared_memory(0);
|
||||
|
||||
// Configure step one
|
||||
if (max_shared_memory >= partial_sm && max_shared_memory < full_sm_step_one) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
@@ -333,10 +341,10 @@ execute_step_one(cudaStream_t stream, uint32_t gpu_index, Torus *lut_vector,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t base_log, uint32_t level_count, int8_t *d_mem,
|
||||
uint32_t max_shared_memory, int lwe_iteration,
|
||||
uint64_t partial_sm, uint64_t partial_dm, uint64_t full_sm,
|
||||
uint64_t full_dm, uint32_t gpu_offset) {
|
||||
int lwe_iteration, uint64_t partial_sm, uint64_t partial_dm,
|
||||
uint64_t full_sm, uint64_t full_dm) {
|
||||
|
||||
int max_shared_memory = cuda_get_max_shared_memory(0);
|
||||
cudaSetDevice(gpu_index);
|
||||
int thds = polynomial_size / params::opt;
|
||||
dim3 grid(level_count, glwe_dimension + 1, input_lwe_ciphertext_count);
|
||||
@@ -347,21 +355,21 @@ execute_step_one(cudaStream_t stream, uint32_t gpu_index, Torus *lut_vector,
|
||||
lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
|
||||
bootstrapping_key, global_accumulator, global_accumulator_fft,
|
||||
lwe_iteration, lwe_dimension, polynomial_size, base_log,
|
||||
level_count, d_mem, full_dm, gpu_offset);
|
||||
level_count, d_mem, full_dm);
|
||||
} else if (max_shared_memory < full_sm) {
|
||||
device_programmable_bootstrap_step_one<Torus, params, PARTIALSM>
|
||||
<<<grid, thds, partial_sm, stream>>>(
|
||||
lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
|
||||
bootstrapping_key, global_accumulator, global_accumulator_fft,
|
||||
lwe_iteration, lwe_dimension, polynomial_size, base_log,
|
||||
level_count, d_mem, partial_dm, gpu_offset);
|
||||
level_count, d_mem, partial_dm);
|
||||
} else {
|
||||
device_programmable_bootstrap_step_one<Torus, params, FULLSM>
|
||||
<<<grid, thds, full_sm, stream>>>(
|
||||
lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
|
||||
bootstrapping_key, global_accumulator, global_accumulator_fft,
|
||||
lwe_iteration, lwe_dimension, polynomial_size, base_log,
|
||||
level_count, d_mem, 0, gpu_offset);
|
||||
level_count, d_mem, 0);
|
||||
}
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
@@ -375,10 +383,10 @@ execute_step_two(cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t base_log, uint32_t level_count, int8_t *d_mem,
|
||||
uint32_t max_shared_memory, int lwe_iteration,
|
||||
uint64_t partial_sm, uint64_t partial_dm, uint64_t full_sm,
|
||||
uint64_t full_dm, uint32_t gpu_offset) {
|
||||
int lwe_iteration, uint64_t partial_sm, uint64_t partial_dm,
|
||||
uint64_t full_sm, uint64_t full_dm) {
|
||||
|
||||
int max_shared_memory = cuda_get_max_shared_memory(0);
|
||||
cudaSetDevice(gpu_index);
|
||||
int thds = polynomial_size / params::opt;
|
||||
dim3 grid(input_lwe_ciphertext_count, glwe_dimension + 1);
|
||||
@@ -389,21 +397,21 @@ execute_step_two(cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
|
||||
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
|
||||
bootstrapping_key, global_accumulator, global_accumulator_fft,
|
||||
lwe_iteration, lwe_dimension, polynomial_size, base_log,
|
||||
level_count, d_mem, full_dm, gpu_offset);
|
||||
level_count, d_mem, full_dm);
|
||||
} else if (max_shared_memory < full_sm) {
|
||||
device_programmable_bootstrap_step_two<Torus, params, PARTIALSM>
|
||||
<<<grid, thds, partial_sm, stream>>>(
|
||||
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
|
||||
bootstrapping_key, global_accumulator, global_accumulator_fft,
|
||||
lwe_iteration, lwe_dimension, polynomial_size, base_log,
|
||||
level_count, d_mem, partial_dm, gpu_offset);
|
||||
level_count, d_mem, partial_dm);
|
||||
} else {
|
||||
device_programmable_bootstrap_step_two<Torus, params, FULLSM>
|
||||
<<<grid, thds, full_sm, stream>>>(
|
||||
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
|
||||
bootstrapping_key, global_accumulator, global_accumulator_fft,
|
||||
lwe_iteration, lwe_dimension, polynomial_size, base_log,
|
||||
level_count, d_mem, 0, gpu_offset);
|
||||
level_count, d_mem, 0);
|
||||
}
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
@@ -417,8 +425,7 @@ __host__ void host_programmable_bootstrap(
|
||||
Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
|
||||
pbs_buffer<Torus, CLASSICAL> *pbs_buffer, uint32_t glwe_dimension,
|
||||
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
|
||||
uint32_t num_luts, uint32_t max_shared_memory, uint32_t gpu_offset) {
|
||||
uint32_t level_count, uint32_t input_lwe_ciphertext_count) {
|
||||
cudaSetDevice(gpu_index);
|
||||
|
||||
// With SM each block corresponds to either the mask or body, no need to
|
||||
@@ -447,16 +454,14 @@ __host__ void host_programmable_bootstrap(
|
||||
stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, global_accumulator,
|
||||
global_accumulator_fft, input_lwe_ciphertext_count, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, base_log, level_count, d_mem,
|
||||
max_shared_memory, i, partial_sm, partial_dm_step_one, full_sm_step_one,
|
||||
full_dm_step_one, gpu_offset);
|
||||
glwe_dimension, polynomial_size, base_log, level_count, d_mem, i,
|
||||
partial_sm, partial_dm_step_one, full_sm_step_one, full_dm_step_one);
|
||||
execute_step_two<Torus, params>(
|
||||
stream, gpu_index, lwe_array_out, lwe_output_indexes, lut_vector,
|
||||
lut_vector_indexes, bootstrapping_key, global_accumulator,
|
||||
global_accumulator_fft, input_lwe_ciphertext_count, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, base_log, level_count, d_mem,
|
||||
max_shared_memory, i, partial_sm, partial_dm_step_two, full_sm_step_two,
|
||||
full_dm_step_two, gpu_offset);
|
||||
glwe_dimension, polynomial_size, base_log, level_count, d_mem, i,
|
||||
partial_sm, partial_dm_step_two, full_sm_step_two, full_dm_step_two);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -9,53 +9,45 @@
|
||||
|
||||
bool has_support_to_cuda_programmable_bootstrap_cg_multi_bit(
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t num_samples, uint32_t max_shared_memory) {
|
||||
uint32_t num_samples) {
|
||||
return supports_cooperative_groups_on_multibit_programmable_bootstrap<
|
||||
uint64_t>(glwe_dimension, polynomial_size, level_count, num_samples,
|
||||
max_shared_memory);
|
||||
uint64_t>(glwe_dimension, polynomial_size, level_count, num_samples);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
bool has_support_to_cuda_programmable_bootstrap_tbc_multi_bit(
|
||||
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t max_shared_memory) {
|
||||
uint32_t level_count) {
|
||||
#if CUDA_ARCH >= 900
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
return supports_thread_block_clusters_on_multibit_programmable_bootstrap<
|
||||
Torus, AmortizedDegree<256>>(num_samples, glwe_dimension,
|
||||
polynomial_size, level_count,
|
||||
max_shared_memory);
|
||||
polynomial_size, level_count);
|
||||
case 512:
|
||||
return supports_thread_block_clusters_on_multibit_programmable_bootstrap<
|
||||
Torus, AmortizedDegree<512>>(num_samples, glwe_dimension,
|
||||
polynomial_size, level_count,
|
||||
max_shared_memory);
|
||||
polynomial_size, level_count);
|
||||
case 1024:
|
||||
return supports_thread_block_clusters_on_multibit_programmable_bootstrap<
|
||||
Torus, AmortizedDegree<1024>>(num_samples, glwe_dimension,
|
||||
polynomial_size, level_count,
|
||||
max_shared_memory);
|
||||
polynomial_size, level_count);
|
||||
case 2048:
|
||||
return supports_thread_block_clusters_on_multibit_programmable_bootstrap<
|
||||
Torus, AmortizedDegree<2048>>(num_samples, glwe_dimension,
|
||||
polynomial_size, level_count,
|
||||
max_shared_memory);
|
||||
polynomial_size, level_count);
|
||||
case 4096:
|
||||
return supports_thread_block_clusters_on_multibit_programmable_bootstrap<
|
||||
Torus, AmortizedDegree<4096>>(num_samples, glwe_dimension,
|
||||
polynomial_size, level_count,
|
||||
max_shared_memory);
|
||||
polynomial_size, level_count);
|
||||
case 8192:
|
||||
return supports_thread_block_clusters_on_multibit_programmable_bootstrap<
|
||||
Torus, AmortizedDegree<8192>>(num_samples, glwe_dimension,
|
||||
polynomial_size, level_count,
|
||||
max_shared_memory);
|
||||
polynomial_size, level_count);
|
||||
case 16384:
|
||||
return supports_thread_block_clusters_on_multibit_programmable_bootstrap<
|
||||
Torus, AmortizedDegree<16384>>(num_samples, glwe_dimension,
|
||||
polynomial_size, level_count,
|
||||
max_shared_memory);
|
||||
polynomial_size, level_count);
|
||||
default:
|
||||
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
|
||||
"N's are powers of two"
|
||||
@@ -73,9 +65,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
|
||||
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
|
||||
uint32_t gpu_offset, uint32_t lwe_chunk_size) {
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
|
||||
|
||||
if (base_log > 64)
|
||||
PANIC("Cuda error (multi-bit PBS): base log should be > number of bits in "
|
||||
@@ -83,74 +73,60 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
host_cg_multi_bit_programmable_bootstrap<uint64_t, int64_t,
|
||||
AmortizedDegree<256>>(
|
||||
host_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<256>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset,
|
||||
lwe_chunk_size);
|
||||
num_samples);
|
||||
break;
|
||||
case 512:
|
||||
host_cg_multi_bit_programmable_bootstrap<Torus, int64_t,
|
||||
AmortizedDegree<512>>(
|
||||
host_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<512>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset,
|
||||
lwe_chunk_size);
|
||||
num_samples);
|
||||
break;
|
||||
case 1024:
|
||||
host_cg_multi_bit_programmable_bootstrap<Torus, int64_t,
|
||||
AmortizedDegree<1024>>(
|
||||
host_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<1024>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset,
|
||||
lwe_chunk_size);
|
||||
num_samples);
|
||||
break;
|
||||
case 2048:
|
||||
host_cg_multi_bit_programmable_bootstrap<Torus, int64_t,
|
||||
AmortizedDegree<2048>>(
|
||||
host_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<2048>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset,
|
||||
lwe_chunk_size);
|
||||
num_samples);
|
||||
break;
|
||||
case 4096:
|
||||
host_cg_multi_bit_programmable_bootstrap<Torus, int64_t,
|
||||
AmortizedDegree<4096>>(
|
||||
host_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<4096>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset,
|
||||
lwe_chunk_size);
|
||||
num_samples);
|
||||
break;
|
||||
case 8192:
|
||||
host_cg_multi_bit_programmable_bootstrap<Torus, int64_t,
|
||||
AmortizedDegree<8192>>(
|
||||
host_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<8192>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset,
|
||||
lwe_chunk_size);
|
||||
num_samples);
|
||||
break;
|
||||
case 16384:
|
||||
host_cg_multi_bit_programmable_bootstrap<Torus, int64_t,
|
||||
AmortizedDegree<16384>>(
|
||||
host_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<16384>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset,
|
||||
lwe_chunk_size);
|
||||
num_samples);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
|
||||
@@ -166,9 +142,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
|
||||
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
|
||||
uint32_t gpu_offset, uint32_t lwe_chunk_size) {
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
|
||||
|
||||
if (base_log > 64)
|
||||
PANIC("Cuda error (multi-bit PBS): base log should be > number of bits in "
|
||||
@@ -176,73 +150,60 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
host_multi_bit_programmable_bootstrap<uint64_t, int64_t,
|
||||
AmortizedDegree<256>>(
|
||||
host_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<256>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset,
|
||||
lwe_chunk_size);
|
||||
num_samples);
|
||||
break;
|
||||
case 512:
|
||||
host_multi_bit_programmable_bootstrap<Torus, int64_t, AmortizedDegree<512>>(
|
||||
host_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<512>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset,
|
||||
lwe_chunk_size);
|
||||
num_samples);
|
||||
break;
|
||||
case 1024:
|
||||
host_multi_bit_programmable_bootstrap<Torus, int64_t,
|
||||
AmortizedDegree<1024>>(
|
||||
host_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<1024>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset,
|
||||
lwe_chunk_size);
|
||||
num_samples);
|
||||
break;
|
||||
case 2048:
|
||||
host_multi_bit_programmable_bootstrap<Torus, int64_t,
|
||||
AmortizedDegree<2048>>(
|
||||
host_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<2048>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset,
|
||||
lwe_chunk_size);
|
||||
num_samples);
|
||||
break;
|
||||
case 4096:
|
||||
host_multi_bit_programmable_bootstrap<Torus, int64_t,
|
||||
AmortizedDegree<4096>>(
|
||||
host_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<4096>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset,
|
||||
lwe_chunk_size);
|
||||
num_samples);
|
||||
break;
|
||||
case 8192:
|
||||
host_multi_bit_programmable_bootstrap<Torus, int64_t,
|
||||
AmortizedDegree<8192>>(
|
||||
host_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<8192>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset,
|
||||
lwe_chunk_size);
|
||||
num_samples);
|
||||
break;
|
||||
case 16384:
|
||||
host_multi_bit_programmable_bootstrap<Torus, int64_t,
|
||||
AmortizedDegree<16384>>(
|
||||
host_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<16384>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset,
|
||||
lwe_chunk_size);
|
||||
num_samples);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
|
||||
@@ -257,9 +218,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
|
||||
void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
|
||||
int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
|
||||
uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t gpu_offset,
|
||||
uint32_t lwe_chunk_size) {
|
||||
uint32_t level_count, uint32_t num_samples) {
|
||||
|
||||
pbs_buffer<uint64_t, MULTI_BIT> *buffer =
|
||||
(pbs_buffer<uint64_t, MULTI_BIT> *)mem_ptr;
|
||||
@@ -274,15 +233,13 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
|
||||
static_cast<uint64_t *>(lut_vector_indexes),
|
||||
static_cast<uint64_t *>(lwe_array_in),
|
||||
static_cast<uint64_t *>(lwe_input_indexes),
|
||||
static_cast<uint64_t *>(bootstrapping_key),
|
||||
(pbs_buffer<uint64_t, MULTI_BIT> *)buffer, lwe_dimension,
|
||||
static_cast<uint64_t *>(bootstrapping_key), buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset,
|
||||
lwe_chunk_size);
|
||||
num_samples);
|
||||
break;
|
||||
#else
|
||||
PANIC("Cuda error (multi-bit PBS): TBC pbs is not supported.")
|
||||
#endif
|
||||
break;
|
||||
case PBS_VARIANT::CG:
|
||||
cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
|
||||
stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
|
||||
@@ -293,8 +250,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
|
||||
static_cast<uint64_t *>(lwe_input_indexes),
|
||||
static_cast<uint64_t *>(bootstrapping_key), buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset,
|
||||
lwe_chunk_size);
|
||||
num_samples);
|
||||
break;
|
||||
case PBS_VARIANT::DEFAULT:
|
||||
cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
|
||||
@@ -306,70 +262,61 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
|
||||
static_cast<uint64_t *>(lwe_input_indexes),
|
||||
static_cast<uint64_t *>(bootstrapping_key), buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset,
|
||||
lwe_chunk_size);
|
||||
num_samples);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (multi-bit PBS): unsupported implementation variant.")
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus, typename STorus>
|
||||
template <typename Torus>
|
||||
void scratch_cuda_cg_multi_bit_programmable_bootstrap(
|
||||
void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **buffer,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory, uint32_t lwe_chunk_size) {
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
scratch_cg_multi_bit_programmable_bootstrap<Torus, STorus,
|
||||
AmortizedDegree<256>>(
|
||||
scratch_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<256>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
|
||||
polynomial_size, level_count, input_lwe_ciphertext_count,
|
||||
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
|
||||
allocate_gpu_memory);
|
||||
break;
|
||||
case 512:
|
||||
scratch_cg_multi_bit_programmable_bootstrap<Torus, STorus,
|
||||
AmortizedDegree<512>>(
|
||||
scratch_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<512>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
|
||||
polynomial_size, level_count, input_lwe_ciphertext_count,
|
||||
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
|
||||
allocate_gpu_memory);
|
||||
break;
|
||||
case 1024:
|
||||
scratch_cg_multi_bit_programmable_bootstrap<Torus, STorus,
|
||||
AmortizedDegree<1024>>(
|
||||
scratch_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<1024>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
|
||||
polynomial_size, level_count, input_lwe_ciphertext_count,
|
||||
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
|
||||
allocate_gpu_memory);
|
||||
break;
|
||||
case 2048:
|
||||
scratch_cg_multi_bit_programmable_bootstrap<Torus, STorus,
|
||||
AmortizedDegree<2048>>(
|
||||
scratch_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<2048>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
|
||||
polynomial_size, level_count, input_lwe_ciphertext_count,
|
||||
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
|
||||
allocate_gpu_memory);
|
||||
break;
|
||||
case 4096:
|
||||
scratch_cg_multi_bit_programmable_bootstrap<Torus, STorus,
|
||||
AmortizedDegree<4096>>(
|
||||
scratch_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<4096>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
|
||||
polynomial_size, level_count, input_lwe_ciphertext_count,
|
||||
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
|
||||
allocate_gpu_memory);
|
||||
break;
|
||||
case 8192:
|
||||
scratch_cg_multi_bit_programmable_bootstrap<Torus, STorus,
|
||||
AmortizedDegree<8192>>(
|
||||
scratch_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<8192>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
|
||||
polynomial_size, level_count, input_lwe_ciphertext_count,
|
||||
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
|
||||
allocate_gpu_memory);
|
||||
break;
|
||||
case 16384:
|
||||
scratch_cg_multi_bit_programmable_bootstrap<Torus, STorus,
|
||||
AmortizedDegree<16384>>(
|
||||
scratch_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<16384>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
|
||||
polynomial_size, level_count, input_lwe_ciphertext_count,
|
||||
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
|
||||
allocate_gpu_memory);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
|
||||
@@ -378,70 +325,55 @@ void scratch_cuda_cg_multi_bit_programmable_bootstrap(
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus, typename STorus>
|
||||
template <typename Torus>
|
||||
void scratch_cuda_multi_bit_programmable_bootstrap(
|
||||
void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **buffer,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t grouping_factor,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory, uint32_t lwe_chunk_size) {
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
scratch_multi_bit_programmable_bootstrap<Torus, STorus,
|
||||
AmortizedDegree<256>>(
|
||||
scratch_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<256>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, grouping_factor, max_shared_memory,
|
||||
allocate_gpu_memory, lwe_chunk_size);
|
||||
input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory);
|
||||
break;
|
||||
case 512:
|
||||
scratch_multi_bit_programmable_bootstrap<Torus, STorus,
|
||||
AmortizedDegree<512>>(
|
||||
scratch_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<512>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, grouping_factor, max_shared_memory,
|
||||
allocate_gpu_memory, lwe_chunk_size);
|
||||
input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory);
|
||||
break;
|
||||
case 1024:
|
||||
scratch_multi_bit_programmable_bootstrap<Torus, STorus,
|
||||
AmortizedDegree<1024>>(
|
||||
scratch_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<1024>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, grouping_factor, max_shared_memory,
|
||||
allocate_gpu_memory, lwe_chunk_size);
|
||||
input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory);
|
||||
break;
|
||||
case 2048:
|
||||
scratch_multi_bit_programmable_bootstrap<Torus, STorus,
|
||||
AmortizedDegree<2048>>(
|
||||
scratch_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<2048>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, grouping_factor, max_shared_memory,
|
||||
allocate_gpu_memory, lwe_chunk_size);
|
||||
input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory);
|
||||
break;
|
||||
case 4096:
|
||||
scratch_multi_bit_programmable_bootstrap<Torus, STorus,
|
||||
AmortizedDegree<4096>>(
|
||||
scratch_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<4096>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, grouping_factor, max_shared_memory,
|
||||
allocate_gpu_memory, lwe_chunk_size);
|
||||
input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory);
|
||||
break;
|
||||
case 8192:
|
||||
scratch_multi_bit_programmable_bootstrap<Torus, STorus,
|
||||
AmortizedDegree<8192>>(
|
||||
scratch_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<8192>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, grouping_factor, max_shared_memory,
|
||||
allocate_gpu_memory, lwe_chunk_size);
|
||||
input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory);
|
||||
break;
|
||||
case 16384:
|
||||
scratch_multi_bit_programmable_bootstrap<Torus, STorus,
|
||||
AmortizedDegree<16384>>(
|
||||
scratch_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<16384>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, grouping_factor, max_shared_memory,
|
||||
allocate_gpu_memory, lwe_chunk_size);
|
||||
input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
|
||||
@@ -454,40 +386,35 @@ void scratch_cuda_multi_bit_programmable_bootstrap_64(
|
||||
void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
|
||||
uint32_t max_shared_memory, bool allocate_gpu_memory,
|
||||
uint32_t lwe_chunk_size) {
|
||||
bool allocate_gpu_memory) {
|
||||
|
||||
#if (CUDA_ARCH >= 900)
|
||||
if (has_support_to_cuda_programmable_bootstrap_tbc_multi_bit<uint64_t>(
|
||||
input_lwe_ciphertext_count, glwe_dimension, polynomial_size,
|
||||
level_count, max_shared_memory))
|
||||
scratch_cuda_tbc_multi_bit_programmable_bootstrap<uint64_t, int64_t>(
|
||||
level_count))
|
||||
scratch_cuda_tbc_multi_bit_programmable_bootstrap<uint64_t>(
|
||||
stream, gpu_index, (pbs_buffer<uint64_t, MULTI_BIT> **)buffer,
|
||||
lwe_dimension, glwe_dimension, polynomial_size, level_count,
|
||||
grouping_factor, input_lwe_ciphertext_count, max_shared_memory,
|
||||
allocate_gpu_memory, lwe_chunk_size);
|
||||
grouping_factor, input_lwe_ciphertext_count, allocate_gpu_memory);
|
||||
else
|
||||
#endif
|
||||
if (supports_cooperative_groups_on_multibit_programmable_bootstrap<
|
||||
uint64_t>(glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, max_shared_memory))
|
||||
scratch_cuda_cg_multi_bit_programmable_bootstrap<uint64_t, int64_t>(
|
||||
input_lwe_ciphertext_count))
|
||||
scratch_cuda_cg_multi_bit_programmable_bootstrap<uint64_t>(
|
||||
stream, gpu_index, (pbs_buffer<uint64_t, MULTI_BIT> **)buffer,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory,
|
||||
lwe_chunk_size);
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory);
|
||||
else
|
||||
scratch_cuda_multi_bit_programmable_bootstrap<uint64_t, int64_t>(
|
||||
scratch_cuda_multi_bit_programmable_bootstrap<uint64_t>(
|
||||
stream, gpu_index, (pbs_buffer<uint64_t, MULTI_BIT> **)buffer,
|
||||
lwe_dimension, glwe_dimension, polynomial_size, level_count,
|
||||
grouping_factor, input_lwe_ciphertext_count, max_shared_memory,
|
||||
allocate_gpu_memory, lwe_chunk_size);
|
||||
grouping_factor, input_lwe_ciphertext_count, allocate_gpu_memory);
|
||||
}
|
||||
|
||||
void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream,
|
||||
uint32_t gpu_index,
|
||||
int8_t **buffer) {
|
||||
cudaSetDevice(gpu_index);
|
||||
auto x = (pbs_buffer<uint64_t, MULTI_BIT> *)(*buffer);
|
||||
x->release(static_cast<cudaStream_t>(stream), gpu_index);
|
||||
}
|
||||
@@ -504,15 +431,15 @@ void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream,
|
||||
* benchmarking on an RTX 4090 GPU, balancing performance and resource use.
|
||||
*/
|
||||
template <typename Torus, class params>
|
||||
__host__ uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
|
||||
uint32_t polynomial_size,
|
||||
uint32_t max_shared_memory) {
|
||||
uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
|
||||
uint32_t polynomial_size) {
|
||||
|
||||
uint64_t full_sm_keybundle =
|
||||
get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle<Torus>(
|
||||
polynomial_size);
|
||||
|
||||
int max_blocks_per_sm;
|
||||
int max_shared_memory = cuda_get_max_shared_memory(0);
|
||||
if (max_shared_memory < full_sm_keybundle)
|
||||
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&max_blocks_per_sm,
|
||||
@@ -557,13 +484,12 @@ __host__ uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
|
||||
return divisor;
|
||||
}
|
||||
|
||||
template void scratch_cuda_multi_bit_programmable_bootstrap<uint64_t, int64_t>(
|
||||
template void scratch_cuda_multi_bit_programmable_bootstrap<uint64_t>(
|
||||
void *stream, uint32_t gpu_index,
|
||||
pbs_buffer<uint64_t, MULTI_BIT> **pbs_buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
|
||||
uint32_t max_shared_memory, bool allocate_gpu_memory,
|
||||
uint32_t lwe_chunk_size);
|
||||
bool allocate_gpu_memory);
|
||||
|
||||
template void
|
||||
cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
|
||||
@@ -573,17 +499,13 @@ cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
|
||||
uint64_t *lwe_input_indexes, uint64_t *bootstrapping_key,
|
||||
pbs_buffer<uint64_t, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
|
||||
uint32_t gpu_offset, uint32_t lwe_chunk_size);
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples);
|
||||
|
||||
template void
|
||||
scratch_cuda_cg_multi_bit_programmable_bootstrap<uint64_t, int64_t>(
|
||||
template void scratch_cuda_cg_multi_bit_programmable_bootstrap<uint64_t>(
|
||||
void *stream, uint32_t gpu_index,
|
||||
pbs_buffer<uint64_t, MULTI_BIT> **pbs_buffer, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory, uint32_t lwe_chunk_size);
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
|
||||
|
||||
template void
|
||||
cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
|
||||
@@ -593,80 +515,63 @@ cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
|
||||
uint64_t *lwe_input_indexes, uint64_t *bootstrapping_key,
|
||||
pbs_buffer<uint64_t, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
|
||||
uint32_t gpu_offset, uint32_t lwe_chunk_size);
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples);
|
||||
|
||||
template bool
|
||||
has_support_to_cuda_programmable_bootstrap_tbc_multi_bit<uint64_t>(
|
||||
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t max_shared_memory);
|
||||
uint32_t level_count);
|
||||
|
||||
#if (CUDA_ARCH >= 900)
|
||||
template <typename Torus, typename STorus>
|
||||
template <typename Torus>
|
||||
void scratch_cuda_tbc_multi_bit_programmable_bootstrap(
|
||||
void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **buffer,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t grouping_factor,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory, uint32_t lwe_chunk_size) {
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
scratch_tbc_multi_bit_programmable_bootstrap<Torus, STorus,
|
||||
AmortizedDegree<256>>(
|
||||
scratch_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<256>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, grouping_factor, max_shared_memory,
|
||||
allocate_gpu_memory, lwe_chunk_size);
|
||||
input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory);
|
||||
break;
|
||||
case 512:
|
||||
scratch_tbc_multi_bit_programmable_bootstrap<Torus, STorus,
|
||||
AmortizedDegree<512>>(
|
||||
scratch_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<512>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, grouping_factor, max_shared_memory,
|
||||
allocate_gpu_memory, lwe_chunk_size);
|
||||
input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory);
|
||||
break;
|
||||
case 1024:
|
||||
scratch_tbc_multi_bit_programmable_bootstrap<Torus, STorus,
|
||||
AmortizedDegree<1024>>(
|
||||
scratch_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<1024>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, grouping_factor, max_shared_memory,
|
||||
allocate_gpu_memory, lwe_chunk_size);
|
||||
input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory);
|
||||
break;
|
||||
case 2048:
|
||||
scratch_tbc_multi_bit_programmable_bootstrap<Torus, STorus,
|
||||
AmortizedDegree<2048>>(
|
||||
scratch_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<2048>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, grouping_factor, max_shared_memory,
|
||||
allocate_gpu_memory, lwe_chunk_size);
|
||||
input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory);
|
||||
break;
|
||||
case 4096:
|
||||
scratch_tbc_multi_bit_programmable_bootstrap<Torus, STorus,
|
||||
AmortizedDegree<4096>>(
|
||||
scratch_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<4096>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, grouping_factor, max_shared_memory,
|
||||
allocate_gpu_memory, lwe_chunk_size);
|
||||
input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory);
|
||||
break;
|
||||
case 8192:
|
||||
scratch_tbc_multi_bit_programmable_bootstrap<Torus, STorus,
|
||||
AmortizedDegree<8192>>(
|
||||
scratch_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<8192>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, grouping_factor, max_shared_memory,
|
||||
allocate_gpu_memory, lwe_chunk_size);
|
||||
input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory);
|
||||
break;
|
||||
case 16384:
|
||||
scratch_tbc_multi_bit_programmable_bootstrap<Torus, STorus,
|
||||
AmortizedDegree<16384>>(
|
||||
scratch_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<16384>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, grouping_factor, max_shared_memory,
|
||||
allocate_gpu_memory, lwe_chunk_size);
|
||||
input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
|
||||
@@ -681,9 +586,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
|
||||
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
|
||||
uint32_t gpu_offset, uint32_t lwe_chunk_size) {
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
|
||||
|
||||
if (base_log > 64)
|
||||
PANIC("Cuda error (multi-bit PBS): base log should be > number of bits in "
|
||||
@@ -691,74 +594,60 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
host_tbc_multi_bit_programmable_bootstrap<uint64_t, int64_t,
|
||||
AmortizedDegree<256>>(
|
||||
host_tbc_multi_bit_programmable_bootstrap<uint64_t, AmortizedDegree<256>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset,
|
||||
lwe_chunk_size);
|
||||
num_samples);
|
||||
break;
|
||||
case 512:
|
||||
host_tbc_multi_bit_programmable_bootstrap<Torus, int64_t,
|
||||
AmortizedDegree<512>>(
|
||||
host_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<512>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset,
|
||||
lwe_chunk_size);
|
||||
num_samples);
|
||||
break;
|
||||
case 1024:
|
||||
host_tbc_multi_bit_programmable_bootstrap<Torus, int64_t,
|
||||
AmortizedDegree<1024>>(
|
||||
host_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<1024>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset,
|
||||
lwe_chunk_size);
|
||||
num_samples);
|
||||
break;
|
||||
case 2048:
|
||||
host_tbc_multi_bit_programmable_bootstrap<Torus, int64_t,
|
||||
AmortizedDegree<2048>>(
|
||||
host_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<2048>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset,
|
||||
lwe_chunk_size);
|
||||
num_samples);
|
||||
break;
|
||||
case 4096:
|
||||
host_tbc_multi_bit_programmable_bootstrap<Torus, int64_t,
|
||||
AmortizedDegree<4096>>(
|
||||
host_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<4096>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset,
|
||||
lwe_chunk_size);
|
||||
num_samples);
|
||||
break;
|
||||
case 8192:
|
||||
host_tbc_multi_bit_programmable_bootstrap<Torus, int64_t,
|
||||
AmortizedDegree<8192>>(
|
||||
host_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<8192>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset,
|
||||
lwe_chunk_size);
|
||||
num_samples);
|
||||
break;
|
||||
case 16384:
|
||||
host_tbc_multi_bit_programmable_bootstrap<Torus, int64_t,
|
||||
AmortizedDegree<16384>>(
|
||||
host_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<16384>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset,
|
||||
lwe_chunk_size);
|
||||
num_samples);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
|
||||
@@ -767,13 +656,11 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
}
|
||||
}
|
||||
|
||||
template void
|
||||
scratch_cuda_tbc_multi_bit_programmable_bootstrap<uint64_t, int64_t>(
|
||||
template void scratch_cuda_tbc_multi_bit_programmable_bootstrap<uint64_t>(
|
||||
void *stream, uint32_t gpu_index, pbs_buffer<uint64_t, MULTI_BIT> **buffer,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t grouping_factor,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory, uint32_t lwe_chunk_size);
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
|
||||
|
||||
template void
|
||||
cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
|
||||
@@ -783,7 +670,5 @@ cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
|
||||
uint64_t *lwe_input_indexes, uint64_t *bootstrapping_key,
|
||||
pbs_buffer<uint64_t, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
|
||||
uint32_t gpu_offset, uint32_t lwe_chunk_size);
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples);
|
||||
#endif
|
||||
|
||||
@@ -18,7 +18,7 @@
|
||||
#include <vector>
|
||||
|
||||
template <typename Torus, class params>
|
||||
__device__ Torus calculates_monomial_degree(Torus *lwe_array_group,
|
||||
__device__ Torus calculates_monomial_degree(const Torus *lwe_array_group,
|
||||
uint32_t ggsw_idx,
|
||||
uint32_t grouping_factor) {
|
||||
Torus x = 0;
|
||||
@@ -28,18 +28,101 @@ __device__ Torus calculates_monomial_degree(Torus *lwe_array_group,
|
||||
x += selection_bit * lwe_array_group[i];
|
||||
}
|
||||
|
||||
return rescale_torus_element(
|
||||
x, 2 * params::degree); // 2 * params::log2_degree + 1);
|
||||
return modulus_switch(x, params::log2_degree + 1);
|
||||
}
|
||||
|
||||
template <typename Torus, class params, sharedMemDegree SMD>
|
||||
__device__ void compute_multi_bit_programmable_bootstrap_keybundle(
|
||||
const Torus *__restrict__ lwe_in,
|
||||
double2 *__restrict__ keybundle,
|
||||
const Torus *__restrict__ bootstrapping_key, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
|
||||
uint32_t level_count, uint32_t lwe_chunk_size,
|
||||
uint32_t level_id, uint32_t glwe_id, uint32_t poly_id, uint32_t chunk_id, uint32_t lwe_iteration, Torus *accumulator){
|
||||
|
||||
////////////////////////////////////////////////////////////
|
||||
// Computes all keybundles
|
||||
uint32_t rev_lwe_iteration =
|
||||
((lwe_dimension / grouping_factor) - lwe_iteration - 1);
|
||||
|
||||
// ////////////////////////////////
|
||||
// Keygen guarantees the first term is a constant term of the polynomial, no
|
||||
// polynomial multiplication required
|
||||
const Torus *bsk_slice = get_multi_bit_ith_lwe_gth_group_kth_block(
|
||||
bootstrapping_key, 0, rev_lwe_iteration, glwe_id, level_id,
|
||||
grouping_factor, 2 * polynomial_size, glwe_dimension, level_count);
|
||||
const Torus *bsk_poly = bsk_slice + poly_id * params::degree;
|
||||
|
||||
copy_polynomial<Torus, params::opt, params::degree / params::opt>(
|
||||
bsk_poly, accumulator);
|
||||
|
||||
// Accumulate the other terms
|
||||
for (int g = 1; g < (1 << grouping_factor); g++) {
|
||||
|
||||
const Torus *bsk_slice = get_multi_bit_ith_lwe_gth_group_kth_block(
|
||||
bootstrapping_key, g, rev_lwe_iteration, glwe_id, level_id,
|
||||
grouping_factor, 2 * polynomial_size, glwe_dimension, level_count);
|
||||
const Torus *bsk_poly = bsk_slice + poly_id * params::degree;
|
||||
|
||||
// Calculates the monomial degree
|
||||
const Torus *lwe_array_group =
|
||||
lwe_in + rev_lwe_iteration * grouping_factor;
|
||||
uint32_t monomial_degree = calculates_monomial_degree<Torus, params>(
|
||||
lwe_array_group, g, grouping_factor);
|
||||
|
||||
synchronize_threads_in_block();
|
||||
// Multiply by the bsk element
|
||||
polynomial_product_accumulate_by_monomial<Torus, params>(
|
||||
accumulator, bsk_poly, monomial_degree, false);
|
||||
}
|
||||
|
||||
synchronize_threads_in_block();
|
||||
|
||||
// Move accumulator to local memory
|
||||
double2 temp[params::opt / 2];
|
||||
int tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
temp[i].x = __ll2double_rn((int64_t)accumulator[tid]);
|
||||
temp[i].y =
|
||||
__ll2double_rn((int64_t)accumulator[tid + params::degree / 2]);
|
||||
temp[i].x /= (double)std::numeric_limits<Torus>::max();
|
||||
temp[i].y /= (double)std::numeric_limits<Torus>::max();
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
|
||||
synchronize_threads_in_block();
|
||||
// Move from local memory back to shared memory but as complex
|
||||
tid = threadIdx.x;
|
||||
double2 *fft = (double2 *)accumulator;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
fft[tid] = temp[i];
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
synchronize_threads_in_block();
|
||||
NSMFFT_direct<HalfDegree<params>>(fft);
|
||||
|
||||
// lwe iteration
|
||||
auto keybundle_out = get_ith_mask_kth_block(
|
||||
keybundle, chunk_id, glwe_id, level_id,
|
||||
polynomial_size, glwe_dimension, level_count);
|
||||
auto keybundle_poly = keybundle_out + poly_id * params::degree / 2;
|
||||
|
||||
copy_polynomial<double2, params::opt / 2, params::degree / params::opt>(
|
||||
fft, keybundle_poly);
|
||||
|
||||
}
|
||||
|
||||
template <typename Torus, class params, sharedMemDegree SMD>
|
||||
__global__ void device_multi_bit_programmable_bootstrap_keybundle(
|
||||
Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *keybundle_array,
|
||||
Torus *bootstrapping_key, uint32_t lwe_dimension, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
|
||||
const Torus *__restrict__ lwe_array_in,
|
||||
const Torus *__restrict__ lwe_input_indexes, double2 *keybundle_array,
|
||||
const Torus *__restrict__ bootstrapping_key, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
|
||||
uint32_t level_count, uint32_t lwe_offset, uint32_t lwe_chunk_size,
|
||||
uint32_t keybundle_size_per_input, int8_t *device_mem,
|
||||
uint64_t device_memory_size_per_block, uint32_t gpu_offset) {
|
||||
uint64_t device_memory_size_per_block) {
|
||||
|
||||
extern __shared__ int8_t sharedmem[];
|
||||
int8_t *selected_memory = sharedmem;
|
||||
@@ -58,102 +141,47 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle(
|
||||
uint32_t poly_id = blockIdx.y % (glwe_dimension + 1);
|
||||
uint32_t lwe_iteration = (blockIdx.x % lwe_chunk_size + lwe_offset);
|
||||
uint32_t input_idx = blockIdx.x / lwe_chunk_size;
|
||||
uint32_t chunk_id = blockIdx.x % lwe_chunk_size;
|
||||
|
||||
if (lwe_iteration < (lwe_dimension / grouping_factor)) {
|
||||
//
|
||||
Torus *accumulator = (Torus *)selected_memory;
|
||||
|
||||
Torus *block_lwe_array_in =
|
||||
&lwe_array_in[lwe_input_indexes[input_idx + gpu_offset] *
|
||||
(lwe_dimension + 1)];
|
||||
const Torus *block_lwe_array_in =
|
||||
&lwe_array_in[lwe_input_indexes[input_idx] * (lwe_dimension + 1)];
|
||||
|
||||
double2 *keybundle = keybundle_array +
|
||||
// select the input
|
||||
input_idx * keybundle_size_per_input;
|
||||
|
||||
////////////////////////////////////////////////////////////
|
||||
// Computes all keybundles
|
||||
uint32_t rev_lwe_iteration =
|
||||
((lwe_dimension / grouping_factor) - lwe_iteration - 1);
|
||||
|
||||
// ////////////////////////////////
|
||||
// Keygen guarantees the first term is a constant term of the polynomial, no
|
||||
// polynomial multiplication required
|
||||
Torus *bsk_slice = get_multi_bit_ith_lwe_gth_group_kth_block(
|
||||
bootstrapping_key, 0, rev_lwe_iteration, glwe_id, level_id,
|
||||
grouping_factor, 2 * polynomial_size, glwe_dimension, level_count);
|
||||
Torus *bsk_poly = bsk_slice + poly_id * params::degree;
|
||||
|
||||
copy_polynomial<Torus, params::opt, params::degree / params::opt>(
|
||||
bsk_poly, accumulator);
|
||||
|
||||
// Accumulate the other terms
|
||||
for (int g = 1; g < (1 << grouping_factor); g++) {
|
||||
|
||||
Torus *bsk_slice = get_multi_bit_ith_lwe_gth_group_kth_block(
|
||||
bootstrapping_key, g, rev_lwe_iteration, glwe_id, level_id,
|
||||
grouping_factor, 2 * polynomial_size, glwe_dimension, level_count);
|
||||
Torus *bsk_poly = bsk_slice + poly_id * params::degree;
|
||||
|
||||
// Calculates the monomial degree
|
||||
Torus *lwe_array_group =
|
||||
block_lwe_array_in + rev_lwe_iteration * grouping_factor;
|
||||
uint32_t monomial_degree = calculates_monomial_degree<Torus, params>(
|
||||
lwe_array_group, g, grouping_factor);
|
||||
|
||||
synchronize_threads_in_block();
|
||||
// Multiply by the bsk element
|
||||
polynomial_product_accumulate_by_monomial<Torus, params>(
|
||||
accumulator, bsk_poly, monomial_degree, false);
|
||||
}
|
||||
|
||||
synchronize_threads_in_block();
|
||||
|
||||
double2 *fft = (double2 *)selected_memory;
|
||||
|
||||
// Move accumulator to local memory
|
||||
double2 temp[params::opt / 2];
|
||||
int tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
temp[i].x = __ll2double_rn((int64_t)accumulator[tid]);
|
||||
temp[i].y =
|
||||
__ll2double_rn((int64_t)accumulator[tid + params::degree / 2]);
|
||||
temp[i].x /= (double)std::numeric_limits<Torus>::max();
|
||||
temp[i].y /= (double)std::numeric_limits<Torus>::max();
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
|
||||
synchronize_threads_in_block();
|
||||
// Move from local memory back to shared memory but as complex
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
fft[tid] = temp[i];
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
synchronize_threads_in_block();
|
||||
NSMFFT_direct<HalfDegree<params>>(fft);
|
||||
|
||||
// lwe iteration
|
||||
auto keybundle_out = get_ith_mask_kth_block(
|
||||
keybundle, blockIdx.x % lwe_chunk_size, glwe_id, level_id,
|
||||
polynomial_size, glwe_dimension, level_count);
|
||||
auto keybundle_poly = keybundle_out + poly_id * params::degree / 2;
|
||||
|
||||
copy_polynomial<double2, params::opt / 2, params::degree / params::opt>(
|
||||
fft, keybundle_poly);
|
||||
compute_multi_bit_programmable_bootstrap_keybundle<Torus, params, SMD>(block_lwe_array_in,
|
||||
keybundle,
|
||||
bootstrapping_key,
|
||||
lwe_dimension,
|
||||
glwe_dimension,
|
||||
polynomial_size,
|
||||
grouping_factor,
|
||||
level_count,
|
||||
lwe_chunk_size,
|
||||
level_id, glwe_id,
|
||||
poly_id,
|
||||
chunk_id,
|
||||
lwe_iteration,
|
||||
accumulator);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus, class params, sharedMemDegree SMD>
|
||||
__global__ void device_multi_bit_programmable_bootstrap_accumulate_step_one(
|
||||
Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *lut_vector,
|
||||
Torus *lut_vector_indexes, Torus *global_accumulator,
|
||||
double2 *global_accumulator_fft, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t lwe_iteration, int8_t *device_mem,
|
||||
uint64_t device_memory_size_per_block, uint32_t gpu_offset) {
|
||||
__global__ void __launch_bounds__(params::degree / params::opt)
|
||||
device_multi_bit_programmable_bootstrap_accumulate_step_one(
|
||||
const Torus *__restrict__ lwe_array_in,
|
||||
const Torus *__restrict__ lwe_input_indexes,
|
||||
const Torus *__restrict__ lut_vector,
|
||||
const Torus *__restrict__ lut_vector_indexes, Torus *global_accumulator,
|
||||
double2 *global_accumulator_fft, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t lwe_iteration, int8_t *device_mem,
|
||||
uint64_t device_memory_size_per_block) {
|
||||
|
||||
// We use shared memory for the polynomials that are used often during the
|
||||
// bootstrap, since shared memory is kept in L1 cache and accessing it is
|
||||
@@ -179,12 +207,12 @@ __global__ void device_multi_bit_programmable_bootstrap_accumulate_step_one(
|
||||
if constexpr (SMD == PARTIALSM)
|
||||
accumulator_fft = (double2 *)sharedmem;
|
||||
|
||||
Torus *block_lwe_array_in =
|
||||
&lwe_array_in[lwe_input_indexes[blockIdx.z + gpu_offset] *
|
||||
(lwe_dimension + 1)];
|
||||
const Torus *block_lwe_array_in =
|
||||
&lwe_array_in[lwe_input_indexes[blockIdx.z] * (lwe_dimension + 1)];
|
||||
|
||||
Torus *block_lut_vector = &lut_vector[lut_vector_indexes[blockIdx.z] *
|
||||
params::degree * (glwe_dimension + 1)];
|
||||
const Torus *block_lut_vector =
|
||||
&lut_vector[lut_vector_indexes[blockIdx.z] * params::degree *
|
||||
(glwe_dimension + 1)];
|
||||
|
||||
Torus *global_slice =
|
||||
global_accumulator +
|
||||
@@ -202,8 +230,8 @@ __global__ void device_multi_bit_programmable_bootstrap_accumulate_step_one(
|
||||
// Initializes the accumulator with the body of LWE
|
||||
// Put "b" in [0, 2N[
|
||||
Torus b_hat = 0;
|
||||
rescale_torus_element(block_lwe_array_in[lwe_dimension], b_hat,
|
||||
2 * params::degree);
|
||||
modulus_switch(block_lwe_array_in[lwe_dimension], b_hat,
|
||||
params::log2_degree + 1);
|
||||
|
||||
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
|
||||
params::degree / params::opt>(
|
||||
@@ -242,13 +270,15 @@ __global__ void device_multi_bit_programmable_bootstrap_accumulate_step_one(
|
||||
}
|
||||
|
||||
template <typename Torus, class params, sharedMemDegree SMD>
|
||||
__global__ void device_multi_bit_programmable_bootstrap_accumulate_step_two(
|
||||
Torus *lwe_array_out, Torus *lwe_output_indexes, double2 *keybundle_array,
|
||||
Torus *global_accumulator, double2 *global_accumulator_fft,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t grouping_factor, uint32_t iteration,
|
||||
uint32_t lwe_offset, uint32_t lwe_chunk_size, int8_t *device_mem,
|
||||
uint64_t device_memory_size_per_block, uint32_t gpu_offset) {
|
||||
__global__ void __launch_bounds__(params::degree / params::opt)
|
||||
device_multi_bit_programmable_bootstrap_accumulate_step_two(
|
||||
Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
|
||||
const double2 *__restrict__ keybundle_array, Torus *global_accumulator,
|
||||
double2 *global_accumulator_fft, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t grouping_factor, uint32_t iteration, uint32_t lwe_offset,
|
||||
uint32_t lwe_chunk_size, int8_t *device_mem,
|
||||
uint64_t device_memory_size_per_block) {
|
||||
// We use shared memory for the polynomials that are used often during the
|
||||
// bootstrap, since shared memory is kept in L1 cache and accessing it is
|
||||
// much faster than global memory
|
||||
@@ -268,11 +298,11 @@ __global__ void device_multi_bit_programmable_bootstrap_accumulate_step_two(
|
||||
double2 *accumulator_fft = (double2 *)selected_memory;
|
||||
|
||||
//
|
||||
double2 *keybundle = keybundle_array +
|
||||
// select the input
|
||||
blockIdx.x * lwe_chunk_size * level_count *
|
||||
(glwe_dimension + 1) * (glwe_dimension + 1) *
|
||||
(polynomial_size / 2);
|
||||
const double2 *keybundle = keybundle_array +
|
||||
// select the input
|
||||
blockIdx.x * lwe_chunk_size * level_count *
|
||||
(glwe_dimension + 1) * (glwe_dimension + 1) *
|
||||
(polynomial_size / 2);
|
||||
|
||||
double2 *global_accumulator_fft_input =
|
||||
global_accumulator_fft +
|
||||
@@ -312,7 +342,7 @@ __global__ void device_multi_bit_programmable_bootstrap_accumulate_step_two(
|
||||
if (lwe_iteration + 1 == (lwe_dimension / grouping_factor)) {
|
||||
// Last iteration
|
||||
auto block_lwe_array_out =
|
||||
&lwe_array_out[lwe_output_indexes[blockIdx.x + gpu_offset] *
|
||||
&lwe_array_out[lwe_output_indexes[blockIdx.x] *
|
||||
(glwe_dimension * polynomial_size + 1) +
|
||||
blockIdx.y * polynomial_size];
|
||||
|
||||
@@ -327,58 +357,35 @@ __global__ void device_multi_bit_programmable_bootstrap_accumulate_step_two(
|
||||
}
|
||||
}
|
||||
template <typename Torus>
|
||||
__host__ __device__ uint64_t
|
||||
get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle(
|
||||
uint64_t get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle(
|
||||
uint32_t polynomial_size) {
|
||||
return sizeof(Torus) * polynomial_size; // accumulator
|
||||
return sizeof(double2) * polynomial_size / 2; // accumulator
|
||||
}
|
||||
template <typename Torus>
|
||||
__host__ __device__ uint64_t
|
||||
get_buffer_size_full_sm_multibit_programmable_bootstrap_step_one(
|
||||
uint64_t get_buffer_size_full_sm_multibit_programmable_bootstrap_step_one(
|
||||
uint32_t polynomial_size) {
|
||||
return sizeof(Torus) * polynomial_size * 2; // accumulator
|
||||
}
|
||||
template <typename Torus>
|
||||
__host__ __device__ uint64_t
|
||||
get_buffer_size_partial_sm_multibit_programmable_bootstrap_step_one(
|
||||
uint64_t get_buffer_size_partial_sm_multibit_programmable_bootstrap_step_one(
|
||||
uint32_t polynomial_size) {
|
||||
return sizeof(Torus) * polynomial_size; // accumulator
|
||||
}
|
||||
template <typename Torus>
|
||||
__host__ __device__ uint64_t
|
||||
get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two(
|
||||
uint64_t get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two(
|
||||
uint32_t polynomial_size) {
|
||||
return sizeof(Torus) * polynomial_size; // accumulator
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ __device__ uint64_t get_buffer_size_multibit_programmable_bootstrap(
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t lwe_chunk_size) {
|
||||
|
||||
uint64_t buffer_size = 0;
|
||||
buffer_size += input_lwe_ciphertext_count * lwe_chunk_size * level_count *
|
||||
(glwe_dimension + 1) * (glwe_dimension + 1) *
|
||||
(polynomial_size / 2) * sizeof(double2); // keybundle fft
|
||||
buffer_size += input_lwe_ciphertext_count * (glwe_dimension + 1) *
|
||||
level_count * (polynomial_size / 2) *
|
||||
sizeof(double2); // global_accumulator_fft
|
||||
buffer_size += input_lwe_ciphertext_count * (glwe_dimension + 1) *
|
||||
polynomial_size * sizeof(Torus); // global_accumulator
|
||||
|
||||
return buffer_size + buffer_size % sizeof(double2);
|
||||
}
|
||||
|
||||
template <typename Torus, typename STorus, typename params>
|
||||
template <typename Torus, typename params>
|
||||
__host__ void scratch_multi_bit_programmable_bootstrap(
|
||||
cudaStream_t stream, uint32_t gpu_index,
|
||||
pbs_buffer<Torus, MULTI_BIT> **buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t grouping_factor,
|
||||
uint32_t max_shared_memory, bool allocate_gpu_memory,
|
||||
uint32_t lwe_chunk_size = 0) {
|
||||
bool allocate_gpu_memory) {
|
||||
|
||||
cudaSetDevice(gpu_index);
|
||||
int max_shared_memory = cuda_get_max_shared_memory(0);
|
||||
uint64_t full_sm_keybundle =
|
||||
get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle<Torus>(
|
||||
polynomial_size);
|
||||
@@ -469,10 +476,8 @@ __host__ void scratch_multi_bit_programmable_bootstrap(
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
if (!lwe_chunk_size)
|
||||
lwe_chunk_size =
|
||||
get_lwe_chunk_size<Torus, params>(gpu_index, input_lwe_ciphertext_count,
|
||||
polynomial_size, max_shared_memory);
|
||||
auto lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
|
||||
gpu_index, input_lwe_ciphertext_count, polynomial_size);
|
||||
*buffer = new pbs_buffer<Torus, MULTI_BIT>(
|
||||
stream, gpu_index, glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, lwe_chunk_size, PBS_VARIANT::DEFAULT,
|
||||
@@ -486,10 +491,8 @@ __host__ void execute_compute_keybundle(
|
||||
pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t num_samples,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
|
||||
uint32_t max_shared_memory, uint32_t lwe_chunk_size, int lwe_offset,
|
||||
uint32_t gpu_offset) {
|
||||
uint32_t lwe_chunk_size, int lwe_offset) {
|
||||
|
||||
cudaSetDevice(gpu_index);
|
||||
uint32_t chunk_size =
|
||||
std::min(lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset);
|
||||
|
||||
@@ -500,6 +503,8 @@ __host__ void execute_compute_keybundle(
|
||||
uint64_t full_sm_keybundle =
|
||||
get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle<Torus>(
|
||||
polynomial_size);
|
||||
int max_shared_memory = cuda_get_max_shared_memory(0);
|
||||
cudaSetDevice(gpu_index);
|
||||
|
||||
auto d_mem = buffer->d_mem_keybundle;
|
||||
auto keybundle_fft = buffer->keybundle_fft;
|
||||
@@ -514,15 +519,15 @@ __host__ void execute_compute_keybundle(
|
||||
<<<grid_keybundle, thds, 0, stream>>>(
|
||||
lwe_array_in, lwe_input_indexes, keybundle_fft, bootstrapping_key,
|
||||
lwe_dimension, glwe_dimension, polynomial_size, grouping_factor,
|
||||
base_log, level_count, lwe_offset, chunk_size,
|
||||
keybundle_size_per_input, d_mem, full_sm_keybundle, gpu_offset);
|
||||
level_count, lwe_offset, chunk_size, keybundle_size_per_input,
|
||||
d_mem, full_sm_keybundle);
|
||||
else
|
||||
device_multi_bit_programmable_bootstrap_keybundle<Torus, params, FULLSM>
|
||||
<<<grid_keybundle, thds, full_sm_keybundle, stream>>>(
|
||||
lwe_array_in, lwe_input_indexes, keybundle_fft, bootstrapping_key,
|
||||
lwe_dimension, glwe_dimension, polynomial_size, grouping_factor,
|
||||
base_log, level_count, lwe_offset, chunk_size,
|
||||
keybundle_size_per_input, d_mem, 0, gpu_offset);
|
||||
level_count, lwe_offset, chunk_size, keybundle_size_per_input,
|
||||
d_mem, 0);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
@@ -534,16 +539,16 @@ __host__ void execute_step_one(cudaStream_t stream, uint32_t gpu_index,
|
||||
uint32_t num_samples, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t max_shared_memory,
|
||||
int j, int lwe_offset, uint32_t gpu_offset) {
|
||||
uint32_t level_count, int j, int lwe_offset) {
|
||||
|
||||
cudaSetDevice(gpu_index);
|
||||
uint64_t full_sm_accumulate_step_one =
|
||||
get_buffer_size_full_sm_multibit_programmable_bootstrap_step_one<Torus>(
|
||||
polynomial_size);
|
||||
uint64_t partial_sm_accumulate_step_one =
|
||||
get_buffer_size_partial_sm_multibit_programmable_bootstrap_step_one<
|
||||
Torus>(polynomial_size);
|
||||
int max_shared_memory = cuda_get_max_shared_memory(0);
|
||||
cudaSetDevice(gpu_index);
|
||||
|
||||
//
|
||||
auto d_mem = buffer->d_mem_acc_step_one;
|
||||
@@ -560,7 +565,7 @@ __host__ void execute_step_one(cudaStream_t stream, uint32_t gpu_index,
|
||||
lwe_array_in, lwe_input_indexes, lut_vector, lut_vector_indexes,
|
||||
global_accumulator, global_accumulator_fft, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, base_log, level_count,
|
||||
j + lwe_offset, d_mem, full_sm_accumulate_step_one, gpu_offset);
|
||||
j + lwe_offset, d_mem, full_sm_accumulate_step_one);
|
||||
else if (max_shared_memory < full_sm_accumulate_step_one)
|
||||
device_multi_bit_programmable_bootstrap_accumulate_step_one<Torus, params,
|
||||
PARTIALSM>
|
||||
@@ -569,7 +574,7 @@ __host__ void execute_step_one(cudaStream_t stream, uint32_t gpu_index,
|
||||
lut_vector_indexes, global_accumulator,
|
||||
global_accumulator_fft, lwe_dimension, glwe_dimension,
|
||||
polynomial_size, base_log, level_count, j + lwe_offset,
|
||||
d_mem, partial_sm_accumulate_step_one, gpu_offset);
|
||||
d_mem, partial_sm_accumulate_step_one);
|
||||
else
|
||||
device_multi_bit_programmable_bootstrap_accumulate_step_one<Torus, params,
|
||||
FULLSM>
|
||||
@@ -578,24 +583,25 @@ __host__ void execute_step_one(cudaStream_t stream, uint32_t gpu_index,
|
||||
lut_vector_indexes, global_accumulator,
|
||||
global_accumulator_fft, lwe_dimension, glwe_dimension,
|
||||
polynomial_size, base_log, level_count, j + lwe_offset,
|
||||
d_mem, 0, gpu_offset);
|
||||
d_mem, 0);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
template <typename Torus, class params>
|
||||
__host__ void
|
||||
execute_step_two(cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
|
||||
Torus *lwe_output_indexes,
|
||||
pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t num_samples,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, int32_t grouping_factor,
|
||||
uint32_t level_count, uint32_t max_shared_memory, int j,
|
||||
int lwe_offset, uint32_t lwe_chunk_size, uint32_t gpu_offset) {
|
||||
__host__ void execute_step_two(cudaStream_t stream, uint32_t gpu_index,
|
||||
Torus *lwe_array_out, Torus *lwe_output_indexes,
|
||||
pbs_buffer<Torus, MULTI_BIT> *buffer,
|
||||
uint32_t num_samples, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size,
|
||||
int32_t grouping_factor, uint32_t level_count,
|
||||
int j, int lwe_offset, uint32_t lwe_chunk_size) {
|
||||
|
||||
cudaSetDevice(gpu_index);
|
||||
uint64_t full_sm_accumulate_step_two =
|
||||
get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two<Torus>(
|
||||
polynomial_size);
|
||||
int max_shared_memory = cuda_get_max_shared_memory(0);
|
||||
cudaSetDevice(gpu_index);
|
||||
|
||||
auto d_mem = buffer->d_mem_acc_step_two;
|
||||
auto keybundle_fft = buffer->keybundle_fft;
|
||||
@@ -612,8 +618,7 @@ execute_step_two(cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
|
||||
lwe_array_out, lwe_output_indexes, keybundle_fft,
|
||||
global_accumulator, global_accumulator_fft, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count, grouping_factor, j,
|
||||
lwe_offset, lwe_chunk_size, d_mem, full_sm_accumulate_step_two,
|
||||
gpu_offset);
|
||||
lwe_offset, lwe_chunk_size, d_mem, full_sm_accumulate_step_two);
|
||||
else
|
||||
device_multi_bit_programmable_bootstrap_accumulate_step_two<Torus, params,
|
||||
FULLSM>
|
||||
@@ -621,27 +626,21 @@ execute_step_two(cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
|
||||
stream>>>(lwe_array_out, lwe_output_indexes, keybundle_fft,
|
||||
global_accumulator, global_accumulator_fft, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
grouping_factor, j, lwe_offset, lwe_chunk_size, d_mem, 0,
|
||||
gpu_offset);
|
||||
grouping_factor, j, lwe_offset, lwe_chunk_size, d_mem, 0);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
template <typename Torus, typename STorus, class params>
|
||||
template <typename Torus, class params>
|
||||
__host__ void host_multi_bit_programmable_bootstrap(
|
||||
cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
|
||||
Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
|
||||
Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
|
||||
pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
|
||||
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
|
||||
uint32_t gpu_offset, uint32_t lwe_chunk_size = 0) {
|
||||
cudaSetDevice(gpu_index);
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
|
||||
|
||||
// If a chunk size is not passed to this function, select one.
|
||||
if (!lwe_chunk_size)
|
||||
lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
|
||||
gpu_index, num_samples, polynomial_size, max_shared_memory);
|
||||
auto lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
|
||||
gpu_index, num_samples, polynomial_size);
|
||||
|
||||
for (uint32_t lwe_offset = 0; lwe_offset < (lwe_dimension / grouping_factor);
|
||||
lwe_offset += lwe_chunk_size) {
|
||||
@@ -650,8 +649,7 @@ __host__ void host_multi_bit_programmable_bootstrap(
|
||||
execute_compute_keybundle<Torus, params>(
|
||||
stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
|
||||
buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
grouping_factor, base_log, level_count, max_shared_memory,
|
||||
lwe_chunk_size, lwe_offset, gpu_offset);
|
||||
grouping_factor, base_log, level_count, lwe_chunk_size, lwe_offset);
|
||||
// Accumulate
|
||||
uint32_t chunk_size = std::min(
|
||||
lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset);
|
||||
@@ -659,14 +657,12 @@ __host__ void host_multi_bit_programmable_bootstrap(
|
||||
execute_step_one<Torus, params>(
|
||||
stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, buffer, num_samples, lwe_dimension, glwe_dimension,
|
||||
polynomial_size, base_log, level_count, max_shared_memory, j,
|
||||
lwe_offset, gpu_offset);
|
||||
polynomial_size, base_log, level_count, j, lwe_offset);
|
||||
|
||||
execute_step_two<Torus, params>(
|
||||
stream, gpu_index, lwe_array_out, lwe_output_indexes, buffer,
|
||||
num_samples, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
grouping_factor, level_count, max_shared_memory, j, lwe_offset,
|
||||
lwe_chunk_size, gpu_offset);
|
||||
grouping_factor, level_count, j, lwe_offset, lwe_chunk_size);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -36,12 +36,15 @@ namespace cg = cooperative_groups;
|
||||
*/
|
||||
template <typename Torus, class params, sharedMemDegree SMD>
|
||||
__global__ void device_programmable_bootstrap_tbc(
|
||||
Torus *lwe_array_out, Torus *lwe_output_indexes, Torus *lut_vector,
|
||||
Torus *lut_vector_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes,
|
||||
double2 *bootstrapping_key, double2 *join_buffer, uint32_t lwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
|
||||
int8_t *device_mem, uint64_t device_memory_size_per_block, bool support_dsm,
|
||||
uint32_t gpu_offset) {
|
||||
Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
|
||||
const Torus *__restrict__ lut_vector,
|
||||
const Torus *__restrict__ lut_vector_indexes,
|
||||
const Torus *__restrict__ lwe_array_in,
|
||||
const Torus *__restrict__ lwe_input_indexes,
|
||||
const double2 *__restrict__ bootstrapping_key, double2 *join_buffer,
|
||||
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, int8_t *device_mem,
|
||||
uint64_t device_memory_size_per_block, bool support_dsm) {
|
||||
|
||||
cluster_group cluster = this_cluster();
|
||||
|
||||
@@ -77,12 +80,12 @@ __global__ void device_programmable_bootstrap_tbc(
|
||||
|
||||
// The third dimension of the block is used to determine on which ciphertext
|
||||
// this block is operating, in the case of batch bootstraps
|
||||
Torus *block_lwe_array_in =
|
||||
&lwe_array_in[lwe_input_indexes[blockIdx.z + gpu_offset] *
|
||||
(lwe_dimension + 1)];
|
||||
const Torus *block_lwe_array_in =
|
||||
&lwe_array_in[lwe_input_indexes[blockIdx.z] * (lwe_dimension + 1)];
|
||||
|
||||
Torus *block_lut_vector = &lut_vector[lut_vector_indexes[blockIdx.z] *
|
||||
params::degree * (glwe_dimension + 1)];
|
||||
const Torus *block_lut_vector =
|
||||
&lut_vector[lut_vector_indexes[blockIdx.z] * params::degree *
|
||||
(glwe_dimension + 1)];
|
||||
|
||||
double2 *block_join_buffer =
|
||||
&join_buffer[blockIdx.z * level_count * (glwe_dimension + 1) *
|
||||
@@ -93,8 +96,8 @@ __global__ void device_programmable_bootstrap_tbc(
|
||||
|
||||
// Put "b" in [0, 2N[
|
||||
Torus b_hat = 0;
|
||||
rescale_torus_element(block_lwe_array_in[lwe_dimension], b_hat,
|
||||
2 * params::degree);
|
||||
modulus_switch(block_lwe_array_in[lwe_dimension], b_hat,
|
||||
params::log2_degree + 1);
|
||||
|
||||
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
|
||||
params::degree / params::opt>(
|
||||
@@ -106,8 +109,8 @@ __global__ void device_programmable_bootstrap_tbc(
|
||||
|
||||
// Put "a" in [0, 2N[
|
||||
Torus a_hat = 0;
|
||||
rescale_torus_element(block_lwe_array_in[i], a_hat,
|
||||
2 * params::degree); // 2 * params::log2_degree + 1);
|
||||
modulus_switch(block_lwe_array_in[i], a_hat,
|
||||
params::log2_degree + 1); // 2 * params::log2_degree + 1);
|
||||
|
||||
// Perform ACC * (X^ä - 1)
|
||||
multiply_by_monomial_negacyclic_and_sub_polynomial<
|
||||
@@ -143,7 +146,7 @@ __global__ void device_programmable_bootstrap_tbc(
|
||||
}
|
||||
|
||||
auto block_lwe_array_out =
|
||||
&lwe_array_out[lwe_output_indexes[blockIdx.z + gpu_offset] *
|
||||
&lwe_array_out[lwe_output_indexes[blockIdx.z] *
|
||||
(glwe_dimension * polynomial_size + 1) +
|
||||
blockIdx.y * polynomial_size];
|
||||
|
||||
@@ -157,18 +160,16 @@ __global__ void device_programmable_bootstrap_tbc(
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus, typename STorus, typename params>
|
||||
template <typename Torus, typename params>
|
||||
__host__ void scratch_programmable_bootstrap_tbc(
|
||||
cudaStream_t stream, uint32_t gpu_index,
|
||||
pbs_buffer<Torus, CLASSICAL> **buffer, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory) {
|
||||
cudaSetDevice(gpu_index);
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
|
||||
|
||||
bool supports_dsm =
|
||||
supports_distributed_shared_memory_on_classic_programmable_bootstrap<
|
||||
Torus>(polynomial_size, max_shared_memory);
|
||||
Torus>(polynomial_size);
|
||||
|
||||
uint64_t full_sm = get_buffer_size_full_sm_programmable_bootstrap_tbc<Torus>(
|
||||
polynomial_size);
|
||||
@@ -180,6 +181,7 @@ __host__ void scratch_programmable_bootstrap_tbc(
|
||||
minimum_sm_tbc =
|
||||
get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap<Torus>(
|
||||
polynomial_size);
|
||||
int max_shared_memory = cuda_get_max_shared_memory(0);
|
||||
|
||||
if (max_shared_memory >= full_sm + minimum_sm_tbc) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
@@ -223,13 +225,11 @@ __host__ void host_programmable_bootstrap_tbc(
|
||||
Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
|
||||
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t glwe_dimension,
|
||||
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
|
||||
uint32_t num_luts, uint32_t max_shared_memory, uint32_t gpu_offset) {
|
||||
cudaSetDevice(gpu_index);
|
||||
uint32_t level_count, uint32_t input_lwe_ciphertext_count) {
|
||||
|
||||
auto supports_dsm =
|
||||
supports_distributed_shared_memory_on_classic_programmable_bootstrap<
|
||||
Torus>(polynomial_size, max_shared_memory);
|
||||
Torus>(polynomial_size);
|
||||
|
||||
// With SM each block corresponds to either the mask or body, no need to
|
||||
// duplicate data for each
|
||||
@@ -244,6 +244,9 @@ __host__ void host_programmable_bootstrap_tbc(
|
||||
get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap<Torus>(
|
||||
polynomial_size);
|
||||
|
||||
int max_shared_memory = cuda_get_max_shared_memory(0);
|
||||
cudaSetDevice(gpu_index);
|
||||
|
||||
uint64_t full_dm = full_sm;
|
||||
|
||||
uint64_t partial_dm = full_dm - partial_sm;
|
||||
@@ -278,7 +281,7 @@ __host__ void host_programmable_bootstrap_tbc(
|
||||
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
|
||||
lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer_fft,
|
||||
lwe_dimension, polynomial_size, base_log, level_count, d_mem, full_dm,
|
||||
supports_dsm, gpu_offset));
|
||||
supports_dsm));
|
||||
} else if (max_shared_memory < full_sm + minimum_sm_tbc) {
|
||||
config.dynamicSmemBytes = partial_sm + minimum_sm_tbc;
|
||||
|
||||
@@ -287,7 +290,7 @@ __host__ void host_programmable_bootstrap_tbc(
|
||||
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
|
||||
lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer_fft,
|
||||
lwe_dimension, polynomial_size, base_log, level_count, d_mem,
|
||||
partial_dm, supports_dsm, gpu_offset));
|
||||
partial_dm, supports_dsm));
|
||||
} else {
|
||||
config.dynamicSmemBytes = full_sm + minimum_sm_tbc;
|
||||
|
||||
@@ -296,15 +299,14 @@ __host__ void host_programmable_bootstrap_tbc(
|
||||
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
|
||||
lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer_fft,
|
||||
lwe_dimension, polynomial_size, base_log, level_count, d_mem, 0,
|
||||
supports_dsm, gpu_offset));
|
||||
supports_dsm));
|
||||
}
|
||||
}
|
||||
|
||||
// Verify if the grid size satisfies the cooperative group constraints
|
||||
template <typename Torus, class params>
|
||||
__host__ bool verify_cuda_programmable_bootstrap_tbc_grid_size(
|
||||
int glwe_dimension, int level_count, int num_samples,
|
||||
uint32_t max_shared_memory) {
|
||||
int glwe_dimension, int level_count, int num_samples) {
|
||||
|
||||
// If Cooperative Groups is not supported, no need to check anything else
|
||||
if (!cuda_check_support_cooperative_groups())
|
||||
@@ -318,12 +320,12 @@ __host__ bool verify_cuda_programmable_bootstrap_tbc_grid_size(
|
||||
get_buffer_size_partial_sm_programmable_bootstrap_tbc<Torus>(
|
||||
params::degree);
|
||||
|
||||
int max_shared_memory = cuda_get_max_shared_memory(0);
|
||||
int thds = params::degree / params::opt;
|
||||
|
||||
// Get the maximum number of active blocks per streaming multiprocessors
|
||||
int number_of_blocks = level_count * (glwe_dimension + 1) * num_samples;
|
||||
int max_active_blocks_per_sm;
|
||||
|
||||
if (max_shared_memory < partial_sm) {
|
||||
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&max_active_blocks_per_sm,
|
||||
@@ -348,13 +350,13 @@ __host__ bool verify_cuda_programmable_bootstrap_tbc_grid_size(
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ bool
|
||||
supports_distributed_shared_memory_on_classic_programmable_bootstrap(
|
||||
uint32_t polynomial_size, uint32_t max_shared_memory) {
|
||||
bool supports_distributed_shared_memory_on_classic_programmable_bootstrap(
|
||||
uint32_t polynomial_size) {
|
||||
uint64_t minimum_sm =
|
||||
get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap<Torus>(
|
||||
polynomial_size);
|
||||
|
||||
int max_shared_memory = cuda_get_max_shared_memory(0);
|
||||
if (max_shared_memory < minimum_sm) {
|
||||
// If we cannot store a single polynomial in a block shared memory we cannot
|
||||
// use TBC
|
||||
@@ -367,7 +369,7 @@ supports_distributed_shared_memory_on_classic_programmable_bootstrap(
|
||||
template <typename Torus, class params>
|
||||
__host__ bool supports_thread_block_clusters_on_classic_programmable_bootstrap(
|
||||
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t max_shared_memory) {
|
||||
uint32_t level_count) {
|
||||
|
||||
if (!cuda_check_support_thread_block_clusters() || num_samples > 128)
|
||||
return false;
|
||||
@@ -379,7 +381,7 @@ __host__ bool supports_thread_block_clusters_on_classic_programmable_bootstrap(
|
||||
polynomial_size);
|
||||
uint64_t minimum_sm_tbc = 0;
|
||||
if (supports_distributed_shared_memory_on_classic_programmable_bootstrap<
|
||||
Torus>(polynomial_size, max_shared_memory))
|
||||
Torus>(polynomial_size))
|
||||
minimum_sm_tbc =
|
||||
get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap<Torus>(
|
||||
polynomial_size);
|
||||
@@ -402,6 +404,7 @@ __host__ bool supports_thread_block_clusters_on_classic_programmable_bootstrap(
|
||||
* case and it will fail if we try. Thus, since level_count *
|
||||
* (glwe_dimension+1) is usually smaller than 8 at this moment, we will
|
||||
* disable cudaFuncAttributeNonPortableClusterSizeAllowed */
|
||||
int max_shared_memory = cuda_get_max_shared_memory(0);
|
||||
if (max_shared_memory < partial_sm + minimum_sm_tbc) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_programmable_bootstrap_tbc<Torus, params, NOSM>,
|
||||
|
||||
@@ -18,16 +18,21 @@
|
||||
#include <vector>
|
||||
|
||||
template <typename Torus, class params, sharedMemDegree SMD>
|
||||
__global__ void device_multi_bit_programmable_bootstrap_tbc_accumulate(
|
||||
Torus *lwe_array_out, Torus *lwe_output_indexes, Torus *lut_vector,
|
||||
Torus *lut_vector_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes,
|
||||
double2 *keybundle_array, double2 *join_buffer, Torus *global_accumulator,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t grouping_factor,
|
||||
uint32_t lwe_offset, uint32_t lwe_chunk_size,
|
||||
uint32_t keybundle_size_per_input, int8_t *device_mem,
|
||||
uint64_t device_memory_size_per_block, bool support_dsm,
|
||||
uint32_t gpu_offset) {
|
||||
__global__ void __launch_bounds__(params::degree / params::opt)
|
||||
device_multi_bit_programmable_bootstrap_tbc(
|
||||
Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
|
||||
const Torus *__restrict__ lut_vector,
|
||||
const Torus *__restrict__ lut_vector_indexes,
|
||||
const Torus *__restrict__ lwe_array_in,
|
||||
const Torus *__restrict__ lwe_input_indexes,
|
||||
const Torus *__restrict__ bootstrapping_key,
|
||||
double2 *__restrict__ keybundle_array, double2 *join_buffer,
|
||||
Torus *global_accumulator, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t grouping_factor,
|
||||
uint32_t keybundle_size_per_input,
|
||||
int8_t *device_mem, uint64_t device_memory_size_per_block,
|
||||
bool support_dsm) {
|
||||
|
||||
cluster_group cluster = this_cluster();
|
||||
|
||||
@@ -49,7 +54,8 @@ __global__ void device_multi_bit_programmable_bootstrap_tbc_accumulate(
|
||||
selected_memory = &device_mem[block_index * device_memory_size_per_block];
|
||||
}
|
||||
|
||||
Torus *accumulator = (Torus *)selected_memory;
|
||||
Torus *keybundle_accumulator = (Torus *)selected_memory;
|
||||
Torus *accumulator = keybundle_accumulator + polynomial_size;
|
||||
double2 *accumulator_fft =
|
||||
(double2 *)accumulator +
|
||||
(ptrdiff_t)(sizeof(Torus) * polynomial_size / sizeof(double2));
|
||||
@@ -62,12 +68,12 @@ __global__ void device_multi_bit_programmable_bootstrap_tbc_accumulate(
|
||||
|
||||
// The third dimension of the block is used to determine on which ciphertext
|
||||
// this block is operating, in the case of batch bootstraps
|
||||
Torus *block_lwe_array_in =
|
||||
&lwe_array_in[lwe_input_indexes[blockIdx.z + gpu_offset] *
|
||||
(lwe_dimension + 1)];
|
||||
const Torus *__restrict__ block_lwe_array_in =
|
||||
&lwe_array_in[lwe_input_indexes[blockIdx.z] * (lwe_dimension + 1)];
|
||||
|
||||
Torus *block_lut_vector = &lut_vector[lut_vector_indexes[blockIdx.z] *
|
||||
params::degree * (glwe_dimension + 1)];
|
||||
const Torus *block_lut_vector =
|
||||
&lut_vector[lut_vector_indexes[blockIdx.z] * params::degree *
|
||||
(glwe_dimension + 1)];
|
||||
|
||||
double2 *block_join_buffer =
|
||||
&join_buffer[blockIdx.z * level_count * (glwe_dimension + 1) *
|
||||
@@ -77,27 +83,21 @@ __global__ void device_multi_bit_programmable_bootstrap_tbc_accumulate(
|
||||
global_accumulator +
|
||||
(blockIdx.y + blockIdx.z * (glwe_dimension + 1)) * params::degree;
|
||||
|
||||
double2 *keybundle = keybundle_array +
|
||||
// select the input
|
||||
blockIdx.z * keybundle_size_per_input;
|
||||
double2 *__restrict__ keybundle = keybundle_array +
|
||||
// select the input
|
||||
blockIdx.z * keybundle_size_per_input;
|
||||
|
||||
if (lwe_offset == 0) {
|
||||
// Put "b" in [0, 2N[
|
||||
Torus b_hat = 0;
|
||||
rescale_torus_element(block_lwe_array_in[lwe_dimension], b_hat,
|
||||
2 * params::degree);
|
||||
modulus_switch(block_lwe_array_in[lwe_dimension], b_hat,
|
||||
params::log2_degree + 1);
|
||||
|
||||
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
|
||||
params::degree / params::opt>(
|
||||
accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
|
||||
false);
|
||||
} else {
|
||||
// Load the accumulator calculated in previous iterations
|
||||
copy_polynomial<Torus, params::opt, params::degree / params::opt>(
|
||||
global_slice, accumulator);
|
||||
}
|
||||
|
||||
for (int i = 0; (i + lwe_offset) < lwe_dimension && i < lwe_chunk_size; i++) {
|
||||
for (int i = 0; i < lwe_dimension / grouping_factor; i++) {
|
||||
// Perform a rounding to increase the accuracy of the
|
||||
// bootstrapped ciphertext
|
||||
round_to_closest_multiple_inplace<Torus, params::opt,
|
||||
@@ -115,17 +115,28 @@ __global__ void device_multi_bit_programmable_bootstrap_tbc_accumulate(
|
||||
// don't modify the same memory space at the same time
|
||||
synchronize_threads_in_block();
|
||||
|
||||
// Computes keybundle
|
||||
for(int poly_id = 0; poly_id < glwe_dimension+1; poly_id++){
|
||||
compute_multi_bit_programmable_bootstrap_keybundle<Torus, params, SMD>(
|
||||
block_lwe_array_in,
|
||||
keybundle,
|
||||
bootstrapping_key,
|
||||
lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, level_count, (uint32_t)1,
|
||||
(uint32_t)blockIdx.x,(uint32_t)blockIdx.y, (uint32_t)poly_id, (uint32_t)0, (uint32_t)i, keybundle_accumulator);
|
||||
cluster.sync(); synchronize_threads_in_block();
|
||||
|
||||
}
|
||||
|
||||
// Perform G^-1(ACC) * GGSW -> GLWE
|
||||
mul_ggsw_glwe<Torus, cluster_group, params>(
|
||||
accumulator, accumulator_fft, block_join_buffer, keybundle,
|
||||
polynomial_size, glwe_dimension, level_count, i, cluster, support_dsm);
|
||||
polynomial_size, glwe_dimension, level_count, 0, cluster, support_dsm);
|
||||
|
||||
synchronize_threads_in_block();
|
||||
}
|
||||
|
||||
if (lwe_offset + lwe_chunk_size >= (lwe_dimension / grouping_factor)) {
|
||||
auto block_lwe_array_out =
|
||||
&lwe_array_out[lwe_output_indexes[blockIdx.z + gpu_offset] *
|
||||
&lwe_array_out[lwe_output_indexes[blockIdx.z] *
|
||||
(glwe_dimension * polynomial_size + 1) +
|
||||
blockIdx.y * polynomial_size];
|
||||
|
||||
@@ -137,47 +148,37 @@ __global__ void device_multi_bit_programmable_bootstrap_tbc_accumulate(
|
||||
} else if (blockIdx.x == 0 && blockIdx.y == glwe_dimension) {
|
||||
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
|
||||
}
|
||||
} else {
|
||||
// Load the accumulator calculated in previous iterations
|
||||
copy_polynomial<Torus, params::opt, params::degree / params::opt>(
|
||||
accumulator, global_slice);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ __device__ uint64_t
|
||||
get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap(
|
||||
uint64_t get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap(
|
||||
uint32_t polynomial_size) {
|
||||
return sizeof(Torus) * polynomial_size; // distributed shared memory
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ __device__ uint64_t
|
||||
get_buffer_size_partial_sm_tbc_multibit_programmable_bootstrap(
|
||||
uint64_t get_buffer_size_partial_sm_tbc_multibit_programmable_bootstrap(
|
||||
uint32_t polynomial_size) {
|
||||
return sizeof(Torus) * polynomial_size; // accumulator
|
||||
return sizeof(Torus) * polynomial_size; // accumulator
|
||||
}
|
||||
template <typename Torus>
|
||||
__host__ __device__ uint64_t
|
||||
get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap(
|
||||
uint64_t get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap(
|
||||
uint32_t polynomial_size) {
|
||||
return sizeof(Torus) * polynomial_size * 2; // accumulator
|
||||
return sizeof(Torus) * polynomial_size * 2+ // accumulator
|
||||
sizeof(Torus) * polynomial_size; // keybundle accumulator
|
||||
}
|
||||
|
||||
template <typename Torus, typename STorus, typename params>
|
||||
template <typename Torus, typename params>
|
||||
__host__ void scratch_tbc_multi_bit_programmable_bootstrap(
|
||||
cudaStream_t stream, uint32_t gpu_index,
|
||||
pbs_buffer<uint64_t, MULTI_BIT> **buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t grouping_factor,
|
||||
uint32_t max_shared_memory, bool allocate_gpu_memory,
|
||||
uint32_t lwe_chunk_size = 0) {
|
||||
|
||||
cudaSetDevice(gpu_index);
|
||||
bool allocate_gpu_memory) {
|
||||
|
||||
bool supports_dsm =
|
||||
supports_distributed_shared_memory_on_multibit_programmable_bootstrap<
|
||||
Torus>(polynomial_size, max_shared_memory);
|
||||
Torus>(polynomial_size);
|
||||
|
||||
uint64_t full_sm_keybundle =
|
||||
get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle<Torus>(
|
||||
@@ -194,6 +195,8 @@ __host__ void scratch_tbc_multi_bit_programmable_bootstrap(
|
||||
get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap<Torus>(
|
||||
polynomial_size);
|
||||
|
||||
int max_shared_memory = cuda_get_max_shared_memory(0);
|
||||
|
||||
if (max_shared_memory < full_sm_keybundle) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_multi_bit_programmable_bootstrap_keybundle<Torus, params, NOSM>,
|
||||
@@ -217,44 +220,42 @@ __host__ void scratch_tbc_multi_bit_programmable_bootstrap(
|
||||
if (max_shared_memory <
|
||||
partial_sm_tbc_accumulate + minimum_sm_tbc_accumulate) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
|
||||
device_multi_bit_programmable_bootstrap_tbc<Torus, params,
|
||||
NOSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize,
|
||||
minimum_sm_tbc_accumulate));
|
||||
cudaFuncSetCacheConfig(
|
||||
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
|
||||
device_multi_bit_programmable_bootstrap_tbc<Torus, params,
|
||||
NOSM>,
|
||||
cudaFuncCachePreferShared);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
} else if (max_shared_memory <
|
||||
full_sm_tbc_accumulate + minimum_sm_tbc_accumulate) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
|
||||
device_multi_bit_programmable_bootstrap_tbc<Torus, params,
|
||||
PARTIALSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize,
|
||||
partial_sm_tbc_accumulate + minimum_sm_tbc_accumulate));
|
||||
cudaFuncSetCacheConfig(
|
||||
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
|
||||
device_multi_bit_programmable_bootstrap_tbc<Torus, params,
|
||||
PARTIALSM>,
|
||||
cudaFuncCachePreferShared);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
} else {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
|
||||
device_multi_bit_programmable_bootstrap_tbc<Torus, params,
|
||||
FULLSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize,
|
||||
full_sm_tbc_accumulate + minimum_sm_tbc_accumulate));
|
||||
cudaFuncSetCacheConfig(
|
||||
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
|
||||
device_multi_bit_programmable_bootstrap_tbc<Torus, params,
|
||||
FULLSM>,
|
||||
cudaFuncCachePreferShared);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
if (!lwe_chunk_size)
|
||||
lwe_chunk_size =
|
||||
get_lwe_chunk_size<Torus, params>(gpu_index, input_lwe_ciphertext_count,
|
||||
polynomial_size, max_shared_memory);
|
||||
auto lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
|
||||
gpu_index, input_lwe_ciphertext_count, polynomial_size);
|
||||
*buffer = new pbs_buffer<uint64_t, MULTI_BIT>(
|
||||
stream, gpu_index, glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, lwe_chunk_size, PBS_VARIANT::TBC,
|
||||
@@ -262,20 +263,17 @@ __host__ void scratch_tbc_multi_bit_programmable_bootstrap(
|
||||
}
|
||||
|
||||
template <typename Torus, class params>
|
||||
__host__ void execute_tbc_external_product_loop(
|
||||
__host__ void execute_tbc(
|
||||
cudaStream_t stream, uint32_t gpu_index, Torus *lut_vector,
|
||||
Torus *lut_vector_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes,
|
||||
Torus *lwe_array_out, Torus *lwe_output_indexes,
|
||||
Torus *lwe_array_out, Torus *lwe_output_indexes,Torus *bootstrapping_key,
|
||||
pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t num_samples,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
|
||||
uint32_t lwe_chunk_size, uint32_t max_shared_memory, int lwe_offset,
|
||||
uint32_t gpu_offset) {
|
||||
uint32_t grouping_factor, uint32_t base_log, uint32_t level_count) {
|
||||
|
||||
cudaSetDevice(gpu_index);
|
||||
auto supports_dsm =
|
||||
supports_distributed_shared_memory_on_multibit_programmable_bootstrap<
|
||||
Torus>(polynomial_size, max_shared_memory);
|
||||
Torus>(polynomial_size);
|
||||
|
||||
uint64_t full_dm =
|
||||
get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap<Torus>(
|
||||
@@ -289,12 +287,12 @@ __host__ void execute_tbc_external_product_loop(
|
||||
get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap<Torus>(
|
||||
polynomial_size);
|
||||
|
||||
uint32_t keybundle_size_per_input =
|
||||
lwe_chunk_size * level_count * (glwe_dimension + 1) *
|
||||
(glwe_dimension + 1) * (polynomial_size / 2);
|
||||
int max_shared_memory = cuda_get_max_shared_memory(0);
|
||||
cudaSetDevice(gpu_index);
|
||||
|
||||
uint32_t chunk_size =
|
||||
std::min(lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset);
|
||||
uint32_t keybundle_size_per_input =
|
||||
level_count * (glwe_dimension + 1) *
|
||||
(glwe_dimension + 1) * (polynomial_size / 2);
|
||||
|
||||
auto d_mem = buffer->d_mem_acc_tbc;
|
||||
auto keybundle_fft = buffer->keybundle_fft;
|
||||
@@ -324,82 +322,67 @@ __host__ void execute_tbc_external_product_loop(
|
||||
config.dynamicSmemBytes = minimum_dm;
|
||||
check_cuda_error(cudaLaunchKernelEx(
|
||||
&config,
|
||||
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
|
||||
device_multi_bit_programmable_bootstrap_tbc<Torus, params,
|
||||
NOSM>,
|
||||
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
|
||||
lwe_array_in, lwe_input_indexes, keybundle_fft, buffer_fft,
|
||||
lwe_array_in, lwe_input_indexes, bootstrapping_key, keybundle_fft, buffer_fft,
|
||||
global_accumulator, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
base_log, level_count, grouping_factor, lwe_offset, chunk_size,
|
||||
keybundle_size_per_input, d_mem, full_dm, supports_dsm, gpu_offset));
|
||||
base_log, level_count, grouping_factor,
|
||||
keybundle_size_per_input, d_mem, full_dm, supports_dsm));
|
||||
} else if (max_shared_memory < full_dm + minimum_dm) {
|
||||
config.dynamicSmemBytes = partial_dm + minimum_dm;
|
||||
check_cuda_error(cudaLaunchKernelEx(
|
||||
&config,
|
||||
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
|
||||
device_multi_bit_programmable_bootstrap_tbc<Torus, params,
|
||||
PARTIALSM>,
|
||||
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
|
||||
lwe_array_in, lwe_input_indexes, keybundle_fft, buffer_fft,
|
||||
lwe_array_in, lwe_input_indexes, bootstrapping_key, keybundle_fft, buffer_fft,
|
||||
global_accumulator, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
base_log, level_count, grouping_factor, lwe_offset, chunk_size,
|
||||
keybundle_size_per_input, d_mem, partial_dm, supports_dsm, gpu_offset));
|
||||
base_log, level_count, grouping_factor,
|
||||
keybundle_size_per_input, d_mem, full_dm, supports_dsm));
|
||||
} else {
|
||||
config.dynamicSmemBytes = full_dm + minimum_dm;
|
||||
check_cuda_error(cudaLaunchKernelEx(
|
||||
&config,
|
||||
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
|
||||
device_multi_bit_programmable_bootstrap_tbc<Torus, params,
|
||||
FULLSM>,
|
||||
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
|
||||
lwe_array_in, lwe_input_indexes, keybundle_fft, buffer_fft,
|
||||
lwe_array_in, lwe_input_indexes, bootstrapping_key, keybundle_fft, buffer_fft,
|
||||
global_accumulator, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
base_log, level_count, grouping_factor, lwe_offset, chunk_size,
|
||||
keybundle_size_per_input, d_mem, 0, supports_dsm, gpu_offset));
|
||||
base_log, level_count, grouping_factor,
|
||||
keybundle_size_per_input, d_mem, full_dm, supports_dsm));
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus, typename STorus, class params>
|
||||
template <typename Torus, class params>
|
||||
__host__ void host_tbc_multi_bit_programmable_bootstrap(
|
||||
cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
|
||||
Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
|
||||
Torus *lwe_array_in, Torus *lwe_input_indexes, uint64_t *bootstrapping_key,
|
||||
pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
|
||||
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
|
||||
uint32_t gpu_offset, uint32_t lwe_chunk_size = 0) {
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
|
||||
cudaSetDevice(gpu_index);
|
||||
|
||||
if (!lwe_chunk_size)
|
||||
lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
|
||||
gpu_index, num_samples, polynomial_size, max_shared_memory);
|
||||
auto lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
|
||||
gpu_index, num_samples, polynomial_size);
|
||||
|
||||
for (uint32_t lwe_offset = 0; lwe_offset < (lwe_dimension / grouping_factor);
|
||||
lwe_offset += lwe_chunk_size) {
|
||||
|
||||
// Compute a keybundle
|
||||
execute_compute_keybundle<Torus, params>(
|
||||
stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
|
||||
buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
grouping_factor, base_log, level_count, max_shared_memory,
|
||||
lwe_chunk_size, lwe_offset, gpu_offset);
|
||||
|
||||
// Accumulate
|
||||
execute_tbc_external_product_loop<Torus, params>(
|
||||
execute_tbc<Torus, params>(
|
||||
stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, lwe_array_out, lwe_output_indexes, buffer,
|
||||
lwe_input_indexes, lwe_array_out, lwe_output_indexes, bootstrapping_key,
|
||||
buffer,
|
||||
num_samples, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
grouping_factor, base_log, level_count, lwe_chunk_size,
|
||||
max_shared_memory, lwe_offset, gpu_offset);
|
||||
}
|
||||
grouping_factor, base_log, level_count);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ bool
|
||||
supports_distributed_shared_memory_on_multibit_programmable_bootstrap(
|
||||
uint32_t polynomial_size, uint32_t max_shared_memory) {
|
||||
bool supports_distributed_shared_memory_on_multibit_programmable_bootstrap(
|
||||
uint32_t polynomial_size) {
|
||||
uint64_t minimum_sm =
|
||||
get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap<Torus>(
|
||||
polynomial_size);
|
||||
|
||||
int max_shared_memory = cuda_get_max_shared_memory(0);
|
||||
if (max_shared_memory <= minimum_sm) {
|
||||
// If we cannot store a single polynomial in a block shared memory we
|
||||
// cannot use TBC
|
||||
@@ -412,7 +395,7 @@ supports_distributed_shared_memory_on_multibit_programmable_bootstrap(
|
||||
template <typename Torus, class params>
|
||||
__host__ bool supports_thread_block_clusters_on_multibit_programmable_bootstrap(
|
||||
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t max_shared_memory) {
|
||||
uint32_t level_count) {
|
||||
|
||||
if (!cuda_check_support_thread_block_clusters())
|
||||
return false;
|
||||
@@ -425,7 +408,7 @@ __host__ bool supports_thread_block_clusters_on_multibit_programmable_bootstrap(
|
||||
polynomial_size);
|
||||
uint64_t minimum_sm_tbc_accumulate = 0;
|
||||
if (supports_distributed_shared_memory_on_multibit_programmable_bootstrap<
|
||||
Torus>(polynomial_size, max_shared_memory))
|
||||
Torus>(polynomial_size))
|
||||
minimum_sm_tbc_accumulate =
|
||||
get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap<Torus>(
|
||||
polynomial_size);
|
||||
@@ -448,36 +431,37 @@ __host__ bool supports_thread_block_clusters_on_multibit_programmable_bootstrap(
|
||||
* case and it will fail if we try. Thus, since level_count *
|
||||
* (glwe_dimension+1) is usually smaller than 8 at this moment, we will
|
||||
* disable cudaFuncAttributeNonPortableClusterSizeAllowed */
|
||||
int max_shared_memory = cuda_get_max_shared_memory(0);
|
||||
if (max_shared_memory <
|
||||
partial_sm_tbc_accumulate + minimum_sm_tbc_accumulate) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
|
||||
device_multi_bit_programmable_bootstrap_tbc<Torus, params,
|
||||
NOSM>,
|
||||
cudaFuncAttributeNonPortableClusterSizeAllowed, false));
|
||||
check_cuda_error(cudaOccupancyMaxPotentialClusterSize(
|
||||
&cluster_size,
|
||||
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
|
||||
device_multi_bit_programmable_bootstrap_tbc<Torus, params,
|
||||
NOSM>,
|
||||
&config));
|
||||
} else if (max_shared_memory <
|
||||
full_sm_tbc_accumulate + minimum_sm_tbc_accumulate) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
|
||||
device_multi_bit_programmable_bootstrap_tbc<Torus, params,
|
||||
PARTIALSM>,
|
||||
cudaFuncAttributeNonPortableClusterSizeAllowed, false));
|
||||
check_cuda_error(cudaOccupancyMaxPotentialClusterSize(
|
||||
&cluster_size,
|
||||
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
|
||||
device_multi_bit_programmable_bootstrap_tbc<Torus, params,
|
||||
PARTIALSM>,
|
||||
&config));
|
||||
} else {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
|
||||
device_multi_bit_programmable_bootstrap_tbc<Torus, params,
|
||||
FULLSM>,
|
||||
cudaFuncAttributeNonPortableClusterSizeAllowed, false));
|
||||
check_cuda_error(cudaOccupancyMaxPotentialClusterSize(
|
||||
&cluster_size,
|
||||
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
|
||||
device_multi_bit_programmable_bootstrap_tbc<Torus, params,
|
||||
FULLSM>,
|
||||
&config));
|
||||
}
|
||||
@@ -485,7 +469,7 @@ __host__ bool supports_thread_block_clusters_on_multibit_programmable_bootstrap(
|
||||
return cluster_size >= level_count * (glwe_dimension + 1);
|
||||
}
|
||||
|
||||
template __host__ bool
|
||||
template bool
|
||||
supports_distributed_shared_memory_on_multibit_programmable_bootstrap<uint64_t>(
|
||||
uint32_t polynomial_size, uint32_t max_shared_memory);
|
||||
uint32_t polynomial_size);
|
||||
#endif // FASTMULTIBIT_PBS_H
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user