mirror of
https://github.com/zama-ai/tfhe-rs.git
synced 2026-04-28 03:01:21 -04:00
Compare commits
100 Commits
go/wip/imp
...
feat/gpu/f
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
d45cb74476 | ||
|
|
772c049681 | ||
|
|
516ae67990 | ||
|
|
db61b0bb9b | ||
|
|
dc8091ad0f | ||
|
|
3ccfb9616a | ||
|
|
83dc9b9453 | ||
|
|
4fe72a15c0 | ||
|
|
2a18d6fa32 | ||
|
|
8c2358a9e1 | ||
|
|
c3def17ad8 | ||
|
|
9da58f68c7 | ||
|
|
5c226e98ba | ||
|
|
27ccfbd939 | ||
|
|
40dd2a6ecc | ||
|
|
2d9c13569f | ||
|
|
36deaec607 | ||
|
|
fcc0378c98 | ||
|
|
b31fbf5f23 | ||
|
|
b5c614520a | ||
|
|
46cf465637 | ||
|
|
11a0fe2b40 | ||
|
|
7dcb5bd4a6 | ||
|
|
55a112cca5 | ||
|
|
992c062db0 | ||
|
|
58f5a2c593 | ||
|
|
14c10c374e | ||
|
|
0d202e6e03 | ||
|
|
4aaa3b67d6 | ||
|
|
609e24bf7c | ||
|
|
5cd5fbe1f2 | ||
|
|
089efd7b17 | ||
|
|
a582aadd5d | ||
|
|
19d0a3d8c3 | ||
|
|
af49b99724 | ||
|
|
cf713821da | ||
|
|
a5fb99ee36 | ||
|
|
ac1284679e | ||
|
|
9059ddeacc | ||
|
|
904ffa729b | ||
|
|
c9b4ee84ae | ||
|
|
d56e7e0b2a | ||
|
|
6d2206e5ac | ||
|
|
015b11d309 | ||
|
|
e390e8eb5a | ||
|
|
6a161fef0a | ||
|
|
9fbd96f016 | ||
|
|
a45b7b3974 | ||
|
|
e59a680407 | ||
|
|
cf7968ac6c | ||
|
|
7aa454ee97 | ||
|
|
0aee4c568e | ||
|
|
f9e8df49d2 | ||
|
|
cf56e5853f | ||
|
|
b2e8ef6010 | ||
|
|
bb327b09ae | ||
|
|
5a664aa30d | ||
|
|
4264ba2e20 | ||
|
|
b18aa0df54 | ||
|
|
a501285206 | ||
|
|
d28040342c | ||
|
|
b041608d25 | ||
|
|
eac30027e9 | ||
|
|
aaba7e5916 | ||
|
|
d29ed6b60c | ||
|
|
9ee18dd2c7 | ||
|
|
6ef22e8cb9 | ||
|
|
fa7a6281ad | ||
|
|
5c189d6bf3 | ||
|
|
f8bde7fbde | ||
|
|
f9c4627946 | ||
|
|
5dd6d8d569 | ||
|
|
5e3b793fd7 | ||
|
|
295b6608ee | ||
|
|
5c42fc950e | ||
|
|
ff6e9cab63 | ||
|
|
e88222987a | ||
|
|
bcae0f1beb | ||
|
|
a6a5716e37 | ||
|
|
829b00bb6d | ||
|
|
de1cc0a863 | ||
|
|
69b6c3a353 | ||
|
|
2fcde61e98 | ||
|
|
c22f6ff70e | ||
|
|
fcf7e66d43 | ||
|
|
fc28ea5a30 | ||
|
|
8680e1de0a | ||
|
|
daf57f5665 | ||
|
|
ccf0dc3ad8 | ||
|
|
ba5e717183 | ||
|
|
615ed3d5db | ||
|
|
dda93889da | ||
|
|
748b88e905 | ||
|
|
612657260f | ||
|
|
6ee3eb17b9 | ||
|
|
c1374a0e10 | ||
|
|
a9601fc47d | ||
|
|
bd255cd958 | ||
|
|
6fe36799fd | ||
|
|
02419d6852 |
@@ -1,2 +1,6 @@
|
||||
[alias]
|
||||
xtask = "run --manifest-path ./tasks/Cargo.toml --"
|
||||
|
||||
# Accessed by wasm-bindgen when testing for the wasm target
|
||||
[target.wasm32-unknown-unknown]
|
||||
runner = 'wasm-bindgen-test-runner'
|
||||
|
||||
@@ -26,7 +26,7 @@ jobs:
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
|
||||
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -50,7 +50,7 @@ jobs:
|
||||
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
@@ -100,7 +100,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
|
||||
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
25
.github/workflows/aws_tfhe_fast_tests.yml
vendored
25
.github/workflows/aws_tfhe_fast_tests.yml
vendored
@@ -57,18 +57,19 @@ jobs:
|
||||
|
||||
- name: Check for file changes
|
||||
id: changed-files
|
||||
uses: tj-actions/changed-files@c3a1bb2c992d77180ae65be6ae6c166cf40f857c
|
||||
uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf
|
||||
with:
|
||||
since_last_remote_commit: true
|
||||
files_yaml: |
|
||||
dependencies:
|
||||
- tfhe/Cargo.toml
|
||||
- concrete-csprng/**
|
||||
- tfhe-csprng/**
|
||||
- tfhe-fft/**
|
||||
- tfhe-zk-pok/**
|
||||
- utils/tfhe-versionable/**
|
||||
- utils/tfhe-versionable-derive/**
|
||||
csprng:
|
||||
- concrete-csprng/**
|
||||
- tfhe-csprng/**
|
||||
zk_pok:
|
||||
- tfhe-zk-pok/**
|
||||
versionable:
|
||||
@@ -131,7 +132,7 @@ jobs:
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
|
||||
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -157,14 +158,14 @@ jobs:
|
||||
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
- name: Run concrete-csprng tests
|
||||
- name: Run tfhe-csprng tests
|
||||
if: needs.should-run.outputs.csprng_test == 'true'
|
||||
run: |
|
||||
make test_concrete_csprng
|
||||
make test_tfhe_csprng
|
||||
|
||||
- name: Run tfhe-zk-pok tests
|
||||
if: needs.should-run.outputs.zk_pok_test == 'true'
|
||||
@@ -244,9 +245,13 @@ jobs:
|
||||
run: |
|
||||
make test_high_level_api
|
||||
|
||||
- name: Run safe deserialization tests
|
||||
- name: Run safe serialization tests
|
||||
run: |
|
||||
make test_safe_deserialization
|
||||
make test_safe_serialization
|
||||
|
||||
- name: Run zk tests
|
||||
run: |
|
||||
make test_zk
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() }}
|
||||
@@ -264,7 +269,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
|
||||
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
11
.github/workflows/aws_tfhe_integer_tests.yml
vendored
11
.github/workflows/aws_tfhe_integer_tests.yml
vendored
@@ -46,13 +46,14 @@ jobs:
|
||||
|
||||
- name: Check for file changes
|
||||
id: changed-files
|
||||
uses: tj-actions/changed-files@c3a1bb2c992d77180ae65be6ae6c166cf40f857c
|
||||
uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf
|
||||
with:
|
||||
since_last_remote_commit: true
|
||||
files_yaml: |
|
||||
integer:
|
||||
- tfhe/Cargo.toml
|
||||
- concrete-csprng/**
|
||||
- tfhe-csprng/**
|
||||
- tfhe-fft/**
|
||||
- tfhe-zk-pok/**
|
||||
- tfhe/src/core_crypto/**
|
||||
- tfhe/src/shortint/**
|
||||
@@ -72,7 +73,7 @@ jobs:
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
|
||||
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -96,7 +97,7 @@ jobs:
|
||||
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
@@ -137,7 +138,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
|
||||
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
@@ -46,13 +46,14 @@ jobs:
|
||||
|
||||
- name: Check for file changes
|
||||
id: changed-files
|
||||
uses: tj-actions/changed-files@c3a1bb2c992d77180ae65be6ae6c166cf40f857c
|
||||
uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf
|
||||
with:
|
||||
since_last_remote_commit: true
|
||||
files_yaml: |
|
||||
integer:
|
||||
- tfhe/Cargo.toml
|
||||
- concrete-csprng/**
|
||||
- tfhe-csprng/**
|
||||
- tfhe-fft/**
|
||||
- tfhe-zk-pok/**
|
||||
- tfhe/src/core_crypto/**
|
||||
- tfhe/src/shortint/**
|
||||
@@ -72,7 +73,7 @@ jobs:
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
|
||||
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -96,7 +97,7 @@ jobs:
|
||||
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
@@ -141,7 +142,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
|
||||
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
31
.github/workflows/aws_tfhe_tests.yml
vendored
31
.github/workflows/aws_tfhe_tests.yml
vendored
@@ -40,6 +40,9 @@ jobs:
|
||||
shortint_test: ${{ env.IS_PULL_REQUEST == 'false' ||
|
||||
steps.changed-files.outputs.shortint_any_changed ||
|
||||
steps.changed-files.outputs.dependencies_any_changed }}
|
||||
strings_test: ${{ env.IS_PULL_REQUEST == 'false' ||
|
||||
steps.changed-files.outputs.strings_any_changed ||
|
||||
steps.changed-files.outputs.dependencies_any_changed }}
|
||||
high_level_api_test: ${{ env.IS_PULL_REQUEST == 'false' ||
|
||||
steps.changed-files.outputs.high_level_api_any_changed ||
|
||||
steps.changed-files.outputs.dependencies_any_changed }}
|
||||
@@ -63,16 +66,17 @@ jobs:
|
||||
|
||||
- name: Check for file changes
|
||||
id: changed-files
|
||||
uses: tj-actions/changed-files@c3a1bb2c992d77180ae65be6ae6c166cf40f857c
|
||||
uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf
|
||||
with:
|
||||
since_last_remote_commit: true
|
||||
files_yaml: |
|
||||
dependencies:
|
||||
- tfhe/Cargo.toml
|
||||
- concrete-csprng/**
|
||||
- tfhe-csprng/**
|
||||
- tfhe-fft/**
|
||||
- tfhe-zk-pok/**
|
||||
csprng:
|
||||
- concrete-csprng/**
|
||||
- tfhe-csprng/**
|
||||
zk_pok:
|
||||
- tfhe-zk-pok/**
|
||||
core_crypto:
|
||||
@@ -83,6 +87,11 @@ jobs:
|
||||
shortint:
|
||||
- tfhe/src/core_crypto/**
|
||||
- tfhe/src/shortint/**
|
||||
strings:
|
||||
- tfhe/src/core_crypto/**
|
||||
- tfhe/src/shortint/**
|
||||
- tfhe/src/integer/**
|
||||
- tfhe/src/strings/**
|
||||
high_level_api:
|
||||
- tfhe/src/**
|
||||
- '!tfhe/src/c_api/**'
|
||||
@@ -112,6 +121,7 @@ jobs:
|
||||
steps.changed-files.outputs.core_crypto_any_changed == 'true' ||
|
||||
steps.changed-files.outputs.boolean_any_changed == 'true' ||
|
||||
steps.changed-files.outputs.shortint_any_changed == 'true' ||
|
||||
steps.changed-files.outputs.strings_any_changed == 'true' ||
|
||||
steps.changed-files.outputs.high_level_api_any_changed == 'true' ||
|
||||
steps.changed-files.outputs.c_api_any_changed == 'true' ||
|
||||
steps.changed-files.outputs.examples_any_changed == 'true' ||
|
||||
@@ -131,7 +141,7 @@ jobs:
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
|
||||
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -157,14 +167,14 @@ jobs:
|
||||
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
- name: Run concrete-csprng tests
|
||||
- name: Run tfhe-csprng tests
|
||||
if: needs.should-run.outputs.csprng_test == 'true'
|
||||
run: |
|
||||
make test_concrete_csprng
|
||||
make test_tfhe_csprng
|
||||
|
||||
- name: Run tfhe-zk-pok tests
|
||||
if: needs.should-run.outputs.zk_pok_test == 'true'
|
||||
@@ -201,6 +211,11 @@ jobs:
|
||||
run: |
|
||||
BIG_TESTS_INSTANCE=TRUE make test_shortint_ci
|
||||
|
||||
- name: Run strings tests
|
||||
if: needs.should-run.outputs.strings_test == 'true'
|
||||
run: |
|
||||
BIG_TESTS_INSTANCE=TRUE make test_strings
|
||||
|
||||
- name: Run high-level API tests
|
||||
if: needs.should-run.outputs.high_level_api_test == 'true'
|
||||
run: |
|
||||
@@ -234,7 +249,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
|
||||
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
10
.github/workflows/aws_tfhe_wasm_tests.yml
vendored
10
.github/workflows/aws_tfhe_wasm_tests.yml
vendored
@@ -27,7 +27,7 @@ jobs:
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
|
||||
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -51,7 +51,7 @@ jobs:
|
||||
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
@@ -99,6 +99,10 @@ jobs:
|
||||
run: |
|
||||
make test_web_js_api_parallel_chrome_ci
|
||||
|
||||
- name: Run x86_64/wasm zk compatibility tests
|
||||
run: |
|
||||
make test_zk_wasm_x86_compat_ci
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
@@ -115,7 +119,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
|
||||
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
14
.github/workflows/benchmark_boolean.yml
vendored
14
.github/workflows/benchmark_boolean.yml
vendored
@@ -29,7 +29,7 @@ jobs:
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
|
||||
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -61,13 +61,8 @@ jobs:
|
||||
echo "COMMIT_HASH=$(git describe --tags --dirty)";
|
||||
} >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Set up home
|
||||
# "Install rust" step require root user to have a HOME directory which is not set.
|
||||
run: |
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
@@ -85,8 +80,7 @@ jobs:
|
||||
--commit-date "${{ env.COMMIT_DATE }}" \
|
||||
--bench-date "${{ env.BENCH_DATE }}" \
|
||||
--walk-subdirs \
|
||||
--name-suffix avx512 \
|
||||
--throughput
|
||||
--name-suffix avx512
|
||||
|
||||
- name: Measure key sizes
|
||||
run: |
|
||||
@@ -133,7 +127,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
|
||||
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
14
.github/workflows/benchmark_core_crypto.yml
vendored
14
.github/workflows/benchmark_core_crypto.yml
vendored
@@ -26,7 +26,7 @@ jobs:
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
|
||||
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -57,13 +57,8 @@ jobs:
|
||||
echo "COMMIT_HASH=$(git describe --tags --dirty)";
|
||||
} >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Set up home
|
||||
# "Install rust" step require root user to have a HOME directory which is not set.
|
||||
run: |
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
@@ -83,8 +78,7 @@ jobs:
|
||||
--commit-date "${{ env.COMMIT_DATE }}" \
|
||||
--bench-date "${{ env.BENCH_DATE }}" \
|
||||
--name-suffix avx512 \
|
||||
--walk-subdirs \
|
||||
--throughput
|
||||
--walk-subdirs
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
|
||||
@@ -121,7 +115,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
|
||||
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
11
.github/workflows/benchmark_erc20.yml
vendored
11
.github/workflows/benchmark_erc20.yml
vendored
@@ -29,7 +29,7 @@ jobs:
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
|
||||
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -62,13 +62,8 @@ jobs:
|
||||
echo "COMMIT_HASH=$(git describe --tags --dirty)";
|
||||
} >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Set up home
|
||||
# "Install rust" step require root user to have a HOME directory which is not set.
|
||||
run: |
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
@@ -129,7 +124,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
|
||||
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
9
.github/workflows/benchmark_gpu_4090.yml
vendored
9
.github/workflows/benchmark_gpu_4090.yml
vendored
@@ -54,7 +54,7 @@ jobs:
|
||||
echo "FAST_BENCH=TRUE" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
@@ -79,8 +79,7 @@ jobs:
|
||||
--branch ${{ github.ref_name }} \
|
||||
--commit-date "${{ env.COMMIT_DATE }}" \
|
||||
--bench-date "${{ env.BENCH_DATE }}" \
|
||||
--walk-subdirs \
|
||||
--throughput
|
||||
--walk-subdirs
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
|
||||
@@ -127,7 +126,7 @@ jobs:
|
||||
} >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
@@ -154,7 +153,7 @@ jobs:
|
||||
--commit-date "${{ env.COMMIT_DATE }}" \
|
||||
--bench-date "${{ env.BENCH_DATE }}" \
|
||||
--walk-subdirs \
|
||||
--throughput
|
||||
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
|
||||
|
||||
10
.github/workflows/benchmark_gpu_core_crypto.yml
vendored
10
.github/workflows/benchmark_gpu_core_crypto.yml
vendored
@@ -27,7 +27,7 @@ jobs:
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
|
||||
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -64,7 +64,6 @@ jobs:
|
||||
make -j"$(nproc)"
|
||||
sudo make install
|
||||
|
||||
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
with:
|
||||
@@ -85,7 +84,7 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
@@ -126,8 +125,7 @@ jobs:
|
||||
--commit-date "${{ env.COMMIT_DATE }}" \
|
||||
--bench-date "${{ env.BENCH_DATE }}" \
|
||||
--name-suffix avx512 \
|
||||
--walk-subdirs \
|
||||
--throughput
|
||||
--walk-subdirs
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
|
||||
@@ -169,7 +167,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
|
||||
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
212
.github/workflows/benchmark_gpu_erc20.yml
vendored
212
.github/workflows/benchmark_gpu_erc20.yml
vendored
@@ -1,195 +1,41 @@
|
||||
# Run ERC20 benchmarks on an instance with CUDA and return parsed results to Slab CI bot.
|
||||
name: ERC20 GPU H100 benchmarks
|
||||
# Run CUDA ERC20 benchmarks on a Hyperstack VM and return parsed results to Slab CI bot.
|
||||
name: Cuda ERC20 benchmarks
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
schedule:
|
||||
# Weekly benchmarks will be triggered each Saturday at 5a.m.
|
||||
- cron: '0 5 * * 6'
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
|
||||
PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUST_BACKTRACE: "full"
|
||||
RUST_MIN_STACK: "8388608"
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
inputs:
|
||||
profile:
|
||||
description: "Instance type"
|
||||
required: true
|
||||
type: choice
|
||||
options:
|
||||
- "l40 (n3-L40x1)"
|
||||
- "single-h100 (n3-H100x1)"
|
||||
- "2-h100 (n3-H100x2)"
|
||||
- "multi-h100 (n3-H100x8)"
|
||||
|
||||
jobs:
|
||||
setup-instance:
|
||||
name: Setup instance (cuda-erc20-benchmarks)
|
||||
parse-inputs:
|
||||
runs-on: ubuntu-latest
|
||||
if: github.event_name == 'workflow_dispatch' ||
|
||||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
|
||||
outputs:
|
||||
runner-name: ${{ steps.start-instance.outputs.label }}
|
||||
profile: ${{ steps.parse_profile.outputs.profile }}
|
||||
hardware_name: ${{ steps.parse_hardware_name.outputs.name }}
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
backend: hyperstack
|
||||
profile: single-h100
|
||||
|
||||
cuda-erc20-benchmarks:
|
||||
name: Execute GPU integer benchmarks
|
||||
needs: setup-instance
|
||||
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
|
||||
strategy:
|
||||
fail-fast: false
|
||||
# explicit include-based build matrix, of known valid options
|
||||
matrix:
|
||||
include:
|
||||
- os: ubuntu-22.04
|
||||
cuda: "12.2"
|
||||
gcc: 11
|
||||
env:
|
||||
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
|
||||
CMAKE_VERSION: 3.29.6
|
||||
steps:
|
||||
# Mandatory on hyperstack since a bootable volume is not re-usable yet.
|
||||
- name: Install dependencies
|
||||
- name: Parse profile
|
||||
id: parse_profile
|
||||
run: |
|
||||
sudo apt update
|
||||
sudo apt install -y checkinstall zlib1g-dev libssl-dev
|
||||
wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
|
||||
tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
|
||||
cd cmake-${{ env.CMAKE_VERSION }}
|
||||
./bootstrap
|
||||
make -j"$(nproc)"
|
||||
sudo make install
|
||||
echo "profile=$(echo '${{ inputs.profile }}' | sed 's|\(.*\)[[:space:]](.*)|\1|')" >> "${GITHUB_OUTPUT}"
|
||||
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
with:
|
||||
fetch-depth: 0
|
||||
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Get benchmark details
|
||||
- name: Parse hardware name
|
||||
id: parse_hardware_name
|
||||
run: |
|
||||
{
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)";
|
||||
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
|
||||
echo "COMMIT_HASH=$(git describe --tags --dirty)";
|
||||
} >> "${GITHUB_ENV}"
|
||||
echo "name=$(echo '${{ inputs.profile }}' | sed 's|.*[[:space:]](\(.*\))|\1|')" >> "${GITHUB_OUTPUT}"
|
||||
|
||||
- name: Set up home
|
||||
# "Install rust" step require root user to have a HOME directory which is not set.
|
||||
run: |
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
- name: Export CUDA variables
|
||||
if: ${{ !cancelled() }}
|
||||
run: |
|
||||
{
|
||||
echo "CUDA_PATH=$CUDA_PATH";
|
||||
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
|
||||
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
|
||||
} >> "${GITHUB_ENV}"
|
||||
echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
|
||||
|
||||
# Specify the correct host compilers
|
||||
- name: Export gcc and g++ variables
|
||||
if: ${{ !cancelled() }}
|
||||
run: |
|
||||
{
|
||||
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
|
||||
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
|
||||
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
|
||||
} >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Check device is detected
|
||||
if: ${{ !cancelled() }}
|
||||
run: nvidia-smi
|
||||
|
||||
- name: Run benchmarks
|
||||
run: |
|
||||
make bench_hlapi_erc20_gpu
|
||||
|
||||
- name: Parse results
|
||||
run: |
|
||||
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
|
||||
--database tfhe_rs \
|
||||
--hardware "n3-H100x1" \
|
||||
--backend gpu \
|
||||
--project-version "${{ env.COMMIT_HASH }}" \
|
||||
--branch ${{ github.ref_name }} \
|
||||
--commit-date "${{ env.COMMIT_DATE }}" \
|
||||
--bench-date "${{ env.BENCH_DATE }}" \
|
||||
--walk-subdirs \
|
||||
--name-suffix avx512
|
||||
|
||||
- name: Parse PBS counts
|
||||
run: |
|
||||
python3 ./ci/benchmark_parser.py tfhe/erc20_pbs_count.csv ${{ env.RESULTS_FILENAME }} \
|
||||
--object-sizes \
|
||||
--append-results
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
|
||||
with:
|
||||
name: ${{ github.sha }}_erc20
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Send data to Slab
|
||||
shell: bash
|
||||
run: |
|
||||
python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
|
||||
--slab-url "${{ secrets.SLAB_URL }}"
|
||||
|
||||
slack-notify:
|
||||
name: Slack Notification
|
||||
needs: [ setup-instance, cuda-erc20-benchmarks ]
|
||||
runs-on: ubuntu-latest
|
||||
if: ${{ always() && needs.cuda-erc20-benchmarks.result != 'skipped' && failure() }}
|
||||
continue-on-error: true
|
||||
steps:
|
||||
- name: Send message
|
||||
uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
|
||||
env:
|
||||
SLACK_COLOR: ${{ needs.cuda-erc20-benchmarks.result }}
|
||||
SLACK_MESSAGE: "Integer GPU benchmarks finished with status: ${{ needs.cuda-erc20-benchmarks.result }}. (${{ env.ACTION_RUN_URL }})"
|
||||
|
||||
teardown-instance:
|
||||
name: Teardown instance (cuda-erc20-benchmarks)
|
||||
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
|
||||
needs: [ setup-instance, cuda-erc20-benchmarks, slack-notify ]
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
label: ${{ needs.setup-instance.outputs.runner-name }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "Instance teardown (cuda-erc20-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
run-benchmarks:
|
||||
name: Run benchmarks
|
||||
needs: parse-inputs
|
||||
uses: ./.github/workflows/benchmark_gpu_erc20_common.yml
|
||||
with:
|
||||
profile: ${{ needs.parse-inputs.outputs.profile }}
|
||||
hardware_name: ${{ needs.parse-inputs.outputs.hardware_name }}
|
||||
secrets: inherit
|
||||
|
||||
@@ -1,15 +1,40 @@
|
||||
# Run integer benchmarks on an instance with CUDA and return parsed results to Slab CI bot.
|
||||
name: Integer 2xH100 benchmarks
|
||||
# Run ERC20 benchmarks on an instance with CUDA and return parsed results to Slab CI bot.
|
||||
name: Cuda ERC20 benchmarks - common
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
schedule:
|
||||
# Weekly benchmarks will be triggered each Saturday at 1a.m.
|
||||
- cron: '0 1 * * 6'
|
||||
workflow_call:
|
||||
inputs:
|
||||
backend:
|
||||
type: string
|
||||
default: hyperstack
|
||||
profile:
|
||||
type: string
|
||||
required: true
|
||||
hardware_name:
|
||||
type: string
|
||||
required: true
|
||||
secrets:
|
||||
FHE_ACTIONS_TOKEN:
|
||||
required: true
|
||||
SLAB_ACTION_TOKEN:
|
||||
required: true
|
||||
SLAB_BASE_URL:
|
||||
required: true
|
||||
SLAB_URL:
|
||||
required: true
|
||||
JOB_SECRET:
|
||||
required: true
|
||||
SLACK_CHANNEL:
|
||||
required: true
|
||||
BOT_USERNAME:
|
||||
required: true
|
||||
SLACK_WEBHOOK:
|
||||
required: true
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
|
||||
PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUST_BACKTRACE: "full"
|
||||
RUST_MIN_STACK: "8388608"
|
||||
@@ -20,37 +45,32 @@ env:
|
||||
|
||||
jobs:
|
||||
setup-instance:
|
||||
name: Setup instance (cuda-integer-full-2-gpu-benchmarks)
|
||||
name: Setup instance (cuda-erc20-benchmarks)
|
||||
runs-on: ubuntu-latest
|
||||
if: github.event_name != 'schedule' ||
|
||||
if: github.event_name == 'workflow_dispatch' ||
|
||||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
|
||||
outputs:
|
||||
runner-name: ${{ steps.start-instance.outputs.label }}
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
|
||||
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
backend: hyperstack
|
||||
profile: 2-h100
|
||||
backend: ${{ inputs.backend }}
|
||||
profile: ${{ inputs.profile }}
|
||||
|
||||
cuda-integer-full-2-gpu-benchmarks:
|
||||
name: Execute 2xH100 integer benchmarks
|
||||
cuda-erc20-benchmarks:
|
||||
name: Cuda ERC20 benchmarks (${{ inputs.profile }})
|
||||
needs: setup-instance
|
||||
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
|
||||
timeout-minutes: 1440 # 24 hours
|
||||
continue-on-error: true
|
||||
strategy:
|
||||
fail-fast: false
|
||||
max-parallel: 1
|
||||
# explicit include-based build matrix, of known valid options
|
||||
matrix:
|
||||
command: [integer_multi_bit]
|
||||
op_flavor: [default]
|
||||
# explicit include-based build matrix, of known valid options
|
||||
include:
|
||||
- os: ubuntu-22.04
|
||||
cuda: "12.2"
|
||||
@@ -63,7 +83,7 @@ jobs:
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
sudo apt update
|
||||
sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
|
||||
sudo apt install -y checkinstall zlib1g-dev libssl-dev
|
||||
wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
|
||||
tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
|
||||
cd cmake-${{ env.CMAKE_VERSION }}
|
||||
@@ -91,7 +111,7 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
@@ -115,41 +135,40 @@ jobs:
|
||||
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
|
||||
} >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Check device is detected
|
||||
if: ${{ !cancelled() }}
|
||||
run: nvidia-smi
|
||||
|
||||
- name: Run benchmarks with AVX512
|
||||
- name: Run benchmarks
|
||||
run: |
|
||||
make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_${{ matrix.command }}_gpu
|
||||
make bench_hlapi_erc20_gpu
|
||||
|
||||
- name: Parse results
|
||||
run: |
|
||||
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
|
||||
--database tfhe_rs \
|
||||
--hardware "n3-H100x2" \
|
||||
--hardware "${{ inputs.hardware_name }}" \
|
||||
--backend gpu \
|
||||
--project-version "${{ env.COMMIT_HASH }}" \
|
||||
--branch ${{ github.ref_name }} \
|
||||
--commit-date "${{ env.COMMIT_DATE }}" \
|
||||
--bench-date "${{ env.BENCH_DATE }}" \
|
||||
--walk-subdirs \
|
||||
--name-suffix avx512 \
|
||||
--throughput
|
||||
--name-suffix avx512
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
|
||||
with:
|
||||
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
|
||||
name: ${{ github.sha }}_erc20
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Send data to Slab
|
||||
shell: bash
|
||||
run: |
|
||||
@@ -158,26 +177,26 @@ jobs:
|
||||
|
||||
slack-notify:
|
||||
name: Slack Notification
|
||||
needs: [ setup-instance, cuda-integer-full-2-gpu-benchmarks ]
|
||||
needs: [ setup-instance, cuda-erc20-benchmarks ]
|
||||
runs-on: ubuntu-latest
|
||||
if: ${{ always() && needs.cuda-integer-full-2-gpu-benchmarks.result != 'skipped' && failure() }}
|
||||
if: ${{ always() && needs.cuda-erc20-benchmarks.result != 'skipped' && failure() }}
|
||||
continue-on-error: true
|
||||
steps:
|
||||
- name: Send message
|
||||
uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
|
||||
env:
|
||||
SLACK_COLOR: ${{ needs.cuda-integer-full-2-gpu-benchmarks.result }}
|
||||
SLACK_MESSAGE: "Integer GPU 2xH100 benchmarks finished with status: ${{ needs.cuda-integer-full-2-gpu-benchmarks.result }}. (${{ env.ACTION_RUN_URL }})"
|
||||
SLACK_COLOR: ${{ needs.cuda-erc20-benchmarks.result }}
|
||||
SLACK_MESSAGE: "Cuda ERC20 benchmarks (${{ inputs.profile }}) finished with status: ${{ needs.cuda-erc20-benchmarks.result }}. (${{ env.ACTION_RUN_URL }})"
|
||||
|
||||
teardown-instance:
|
||||
name: Teardown instance (cuda-integer-full-2-gpu-benchmarks)
|
||||
name: Teardown instance (cuda-erc20-${{ inputs.profile }}-benchmarks)
|
||||
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
|
||||
needs: [ setup-instance, cuda-integer-full-2-gpu-benchmarks ]
|
||||
needs: [ setup-instance, cuda-erc20-benchmarks, slack-notify ]
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
|
||||
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -191,4 +210,4 @@ jobs:
|
||||
uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "Instance teardown (cuda-integer-full-2-gpu-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
SLACK_MESSAGE: "Instance teardown (cuda-erc20-${{ inputs.profile }}-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
35
.github/workflows/benchmark_gpu_erc20_weekly.yml
vendored
Normal file
35
.github/workflows/benchmark_gpu_erc20_weekly.yml
vendored
Normal file
@@ -0,0 +1,35 @@
|
||||
# Run CUDA ERC20 benchmarks on multiple Hyperstack VMs and return parsed results to Slab CI bot.
|
||||
name: Cuda ERC20 weekly benchmarks
|
||||
|
||||
on:
|
||||
schedule:
|
||||
# Weekly benchmarks will be triggered each Saturday at 5a.m.
|
||||
- cron: '0 5 * * 6'
|
||||
|
||||
jobs:
|
||||
run-benchmarks-1-h100:
|
||||
name: Run benchmarks (1xH100)
|
||||
if: github.repository == 'zama-ai/tfhe-rs'
|
||||
uses: ./.github/workflows/benchmark_gpu_erc20_common.yml
|
||||
with:
|
||||
profile: single-h100
|
||||
hardware_name: n3-H100x1
|
||||
secrets: inherit
|
||||
|
||||
run-benchmarks-2-h100:
|
||||
name: Run benchmarks (2xH100)
|
||||
if: github.repository == 'zama-ai/tfhe-rs'
|
||||
uses: ./.github/workflows/benchmark_gpu_erc20_common.yml
|
||||
with:
|
||||
profile: 2-h100
|
||||
hardware_name: n3-H100x2
|
||||
secrets: inherit
|
||||
|
||||
run-benchmarks-8-h100:
|
||||
name: Run benchmarks (8xH100)
|
||||
if: github.repository == 'zama-ai/tfhe-rs'
|
||||
uses: ./.github/workflows/benchmark_gpu_erc20_common.yml
|
||||
with:
|
||||
profile: multi-h100
|
||||
hardware_name: n3-H100x8
|
||||
secrets: inherit
|
||||
255
.github/workflows/benchmark_gpu_integer.yml
vendored
255
.github/workflows/benchmark_gpu_integer.yml
vendored
@@ -1,201 +1,78 @@
|
||||
# Run integer benchmarks on an instance with CUDA and return parsed results to Slab CI bot.
|
||||
name: Integer GPU benchmarks
|
||||
# Run CUDA benchmarks on a Hyperstack VM and return parsed results to Slab CI bot.
|
||||
name: Cuda benchmarks
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
|
||||
PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUST_BACKTRACE: "full"
|
||||
RUST_MIN_STACK: "8388608"
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
inputs:
|
||||
profile:
|
||||
description: "Instance type"
|
||||
required: true
|
||||
type: choice
|
||||
options:
|
||||
- "l40 (n3-L40x1)"
|
||||
- "single-h100 (n3-H100x1)"
|
||||
- "2-h100 (n3-H100x2)"
|
||||
- "4-h100 (n3-H100x4)"
|
||||
- "multi-h100 (n3-H100x8)"
|
||||
- "multi-h100-nvlink (n3-H100x8-NVLink)"
|
||||
- "multi-a100-nvlink (n3-A100x8-NVLink)"
|
||||
command:
|
||||
description: "Benchmark command to run"
|
||||
type: choice
|
||||
default: integer_multi_bit
|
||||
options:
|
||||
- integer
|
||||
- integer_multi_bit
|
||||
- integer_compression
|
||||
- pbs
|
||||
- ks
|
||||
op_flavor:
|
||||
description: "Operations set to run"
|
||||
type: choice
|
||||
default: default
|
||||
options:
|
||||
- default
|
||||
- fast_default
|
||||
- unchecked
|
||||
all_precisions:
|
||||
description: "Run all precisions"
|
||||
type: boolean
|
||||
default: false
|
||||
bench_type:
|
||||
description: "Benchmarks type"
|
||||
type: choice
|
||||
default: latency
|
||||
options:
|
||||
- latency
|
||||
- throughput
|
||||
- both
|
||||
|
||||
jobs:
|
||||
setup-instance:
|
||||
name: Setup instance (cuda-integer-benchmarks)
|
||||
parse-inputs:
|
||||
runs-on: ubuntu-latest
|
||||
if: github.event_name == 'workflow_dispatch' ||
|
||||
(github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs')
|
||||
outputs:
|
||||
runner-name: ${{ steps.start-instance.outputs.label }}
|
||||
profile: ${{ steps.parse_profile.outputs.profile }}
|
||||
hardware_name: ${{ steps.parse_hardware_name.outputs.name }}
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
backend: hyperstack
|
||||
profile: single-h100
|
||||
|
||||
cuda-integer-benchmarks:
|
||||
name: Execute GPU integer benchmarks
|
||||
needs: setup-instance
|
||||
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
|
||||
strategy:
|
||||
fail-fast: false
|
||||
# explicit include-based build matrix, of known valid options
|
||||
matrix:
|
||||
include:
|
||||
- os: ubuntu-22.04
|
||||
cuda: "12.2"
|
||||
gcc: 11
|
||||
env:
|
||||
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
|
||||
CMAKE_VERSION: 3.29.6
|
||||
steps:
|
||||
# Mandatory on hyperstack since a bootable volume is not re-usable yet.
|
||||
- name: Install dependencies
|
||||
- name: Parse profile
|
||||
id: parse_profile
|
||||
run: |
|
||||
sudo apt update
|
||||
sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
|
||||
wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
|
||||
tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
|
||||
cd cmake-${{ env.CMAKE_VERSION }}
|
||||
./bootstrap
|
||||
make -j"$(nproc)"
|
||||
sudo make install
|
||||
echo "profile=$(echo '${{ inputs.profile }}' | sed 's|\(.*\)[[:space:]](.*)|\1|')" >> "${GITHUB_OUTPUT}"
|
||||
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
with:
|
||||
fetch-depth: 0
|
||||
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Get benchmark details
|
||||
- name: Parse hardware name
|
||||
id: parse_hardware_name
|
||||
run: |
|
||||
{
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)";
|
||||
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
|
||||
echo "COMMIT_HASH=$(git describe --tags --dirty)";
|
||||
} >> "${GITHUB_ENV}"
|
||||
echo "name=$(echo '${{ inputs.profile }}' | sed 's|.*[[:space:]](\(.*\))|\1|')" >> "${GITHUB_OUTPUT}"
|
||||
|
||||
- name: Set up home
|
||||
# "Install rust" step require root user to have a HOME directory which is not set.
|
||||
run: |
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
- name: Export CUDA variables
|
||||
if: ${{ !cancelled() }}
|
||||
run: |
|
||||
{
|
||||
echo "CUDA_PATH=$CUDA_PATH";
|
||||
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
|
||||
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
|
||||
} >> "${GITHUB_ENV}"
|
||||
echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
|
||||
|
||||
# Specify the correct host compilers
|
||||
- name: Export gcc and g++ variables
|
||||
if: ${{ !cancelled() }}
|
||||
run: |
|
||||
{
|
||||
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
|
||||
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
|
||||
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
|
||||
} >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Check device is detected
|
||||
if: ${{ !cancelled() }}
|
||||
run: nvidia-smi
|
||||
|
||||
- name: Run benchmarks with AVX512
|
||||
run: |
|
||||
make FAST_BENCH=TRUE BENCH_OP_FLAVOR=default bench_integer_gpu
|
||||
|
||||
- name: Parse benchmarks to csv
|
||||
run: |
|
||||
make PARSE_INTEGER_BENCH_CSV_FILE=${{ env.PARSE_INTEGER_BENCH_CSV_FILE }} \
|
||||
parse_integer_benches
|
||||
|
||||
- name: Upload csv results artifact
|
||||
uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
|
||||
with:
|
||||
name: ${{ github.sha }}_csv_integer
|
||||
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
|
||||
|
||||
- name: Parse results
|
||||
run: |
|
||||
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
|
||||
--database tfhe_rs \
|
||||
--hardware "n3-H100x1" \
|
||||
--backend gpu \
|
||||
--project-version "${{ env.COMMIT_HASH }}" \
|
||||
--branch ${{ github.ref_name }} \
|
||||
--commit-date "${{ env.COMMIT_DATE }}" \
|
||||
--bench-date "${{ env.BENCH_DATE }}" \
|
||||
--walk-subdirs \
|
||||
--name-suffix avx512 \
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
|
||||
with:
|
||||
name: ${{ github.sha }}_integer
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Send data to Slab
|
||||
shell: bash
|
||||
run: |
|
||||
python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
|
||||
--slab-url "${{ secrets.SLAB_URL }}"
|
||||
|
||||
slack-notify:
|
||||
name: Slack Notification
|
||||
needs: [ setup-instance, cuda-integer-benchmarks ]
|
||||
runs-on: ubuntu-latest
|
||||
if: ${{ always() && needs.cuda-integer-benchmarks.result != 'skipped' && failure() }}
|
||||
continue-on-error: true
|
||||
steps:
|
||||
- name: Send message
|
||||
uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
|
||||
env:
|
||||
SLACK_COLOR: ${{ needs.cuda-integer-benchmarks.result }}
|
||||
SLACK_MESSAGE: "Integer GPU benchmarks finished with status: ${{ needs.cuda-integer-benchmarks.result }}. (${{ env.ACTION_RUN_URL }})"
|
||||
|
||||
teardown-instance:
|
||||
name: Teardown instance (cuda-integer-benchmarks)
|
||||
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
|
||||
needs: [ setup-instance, cuda-integer-benchmarks, slack-notify ]
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
label: ${{ needs.setup-instance.outputs.runner-name }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "Instance teardown (cuda-integer-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
run-benchmarks:
|
||||
name: Run benchmarks
|
||||
needs: parse-inputs
|
||||
uses: ./.github/workflows/benchmark_gpu_integer_common.yml
|
||||
with:
|
||||
profile: ${{ needs.parse-inputs.outputs.profile }}
|
||||
hardware_name: ${{ needs.parse-inputs.outputs.hardware_name }}
|
||||
command: ${{ inputs.command }}
|
||||
op_flavor: ${{ inputs.op_flavor }}
|
||||
bench_type: ${{ inputs.bench_type }}
|
||||
all_precisions: ${{ inputs.all_precisions }}
|
||||
secrets: inherit
|
||||
|
||||
@@ -1,21 +1,47 @@
|
||||
# Run 64-bit multi-bit integer benchmarks on an instance with CUDA and return parsed results to Slab CI bot.
|
||||
name: Integer multi GPU Multi-bit benchmarks
|
||||
# Run integer benchmarks on CUDA instance and return parsed results to Slab CI bot.
|
||||
name: Cuda benchmarks - common
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
workflow_call:
|
||||
inputs:
|
||||
backend:
|
||||
type: string
|
||||
default: hyperstack
|
||||
profile:
|
||||
type: string
|
||||
required: true
|
||||
hardware_name:
|
||||
type: string
|
||||
required: true
|
||||
command: # Use a comma separated values to generate an array
|
||||
type: string
|
||||
required: true
|
||||
op_flavor: # Use a comma separated values to generate an array
|
||||
type: string
|
||||
required: true
|
||||
bench_type:
|
||||
type: string
|
||||
default: latency
|
||||
all_precisions:
|
||||
description: "Run all precisions"
|
||||
type: boolean
|
||||
default: false
|
||||
fast_default:
|
||||
description: "Run only deduplicated default operations without scalar variants"
|
||||
type: boolean
|
||||
default: false
|
||||
|
||||
schedule:
|
||||
# Weekly benchmarks will be triggered each Saturday at 1a.m.
|
||||
- cron: '0 1 * * 6'
|
||||
secrets:
|
||||
FHE_ACTIONS_TOKEN:
|
||||
required: true
|
||||
SLAB_ACTION_TOKEN:
|
||||
required: true
|
||||
SLAB_BASE_URL:
|
||||
required: true
|
||||
SLAB_URL:
|
||||
required: true
|
||||
JOB_SECRET:
|
||||
required: true
|
||||
SLACK_CHANNEL:
|
||||
required: true
|
||||
BOT_USERNAME:
|
||||
required: true
|
||||
SLACK_WEBHOOK:
|
||||
required: true
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
@@ -28,32 +54,82 @@ env:
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
FAST_BENCH: TRUE
|
||||
BENCH_OP_FLAVOR: default
|
||||
|
||||
jobs:
|
||||
setup-instance:
|
||||
name: Setup instance (cuda-integer-multi-bit-multi-gpu-benchmarks)
|
||||
prepare-matrix:
|
||||
name: Prepare operations matrix
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
command: ${{ steps.set_command.outputs.command }}
|
||||
op_flavor: ${{ steps.set_op_flavor.outputs.op_flavor }}
|
||||
bench_type: ${{ steps.set_bench_type.outputs.bench_type }}
|
||||
steps:
|
||||
- name: Set single command
|
||||
if: ${{ !contains(inputs.command, ',')}}
|
||||
run: |
|
||||
echo "COMMAND=[\"${{ inputs.command }}\"]" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Set multiple commands
|
||||
if: ${{ contains(inputs.command, ',')}}
|
||||
run: |
|
||||
PARSED_COMMAND=$(echo "${{ inputs.command }}" | sed 's/[[:space:]]*,[[:space:]]*/\\", \\"/g')
|
||||
echo "COMMAND=[\"${PARSED_COMMAND}\"]" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Set single operations flavor
|
||||
if: ${{ !contains(inputs.op_flavor, ',')}}
|
||||
run: |
|
||||
echo "OP_FLAVOR=[\"${{ inputs.op_flavor }}\"]" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Set multiple operations flavors
|
||||
if: ${{ contains(inputs.op_flavor, ',')}}
|
||||
run: |
|
||||
PARSED_OP_FLAVOR=$(echo "${{ inputs.op_flavor }}" | sed 's/[[:space:]]*,[[:space:]]*/\\", \\"/g')
|
||||
echo "OP_FLAVOR=[\"${PARSED_OP_FLAVOR}\"]" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Set benchmark types
|
||||
run: |
|
||||
if [[ "${{ inputs.bench_type }}" == "both" ]]; then
|
||||
echo "BENCH_TYPE=[\"latency\", \"throughput\"]" >> "${GITHUB_ENV}"
|
||||
else
|
||||
echo "BENCH_TYPE=[\"${{ inputs.bench_type }}\"]" >> "${GITHUB_ENV}"
|
||||
fi
|
||||
|
||||
- name: Set command output
|
||||
id: set_command
|
||||
run: |
|
||||
echo "command=${{ toJSON(env.COMMAND) }}" >> "${GITHUB_OUTPUT}"
|
||||
|
||||
- name: Set operation flavor output
|
||||
id: set_op_flavor
|
||||
run: |
|
||||
echo "op_flavor=${{ toJSON(env.OP_FLAVOR) }}" >> "${GITHUB_OUTPUT}"
|
||||
|
||||
- name: Set benchmark types output
|
||||
id: set_bench_type
|
||||
run: |
|
||||
echo "bench_type=${{ toJSON(env.BENCH_TYPE) }}" >> "${GITHUB_OUTPUT}"
|
||||
|
||||
setup-instance:
|
||||
name: Setup instance (cuda-${{ inputs.profile }}-benchmarks)
|
||||
needs: prepare-matrix
|
||||
runs-on: ubuntu-latest
|
||||
if: ${{ (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs') ||
|
||||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
|
||||
github.event_name == 'workflow_dispatch' }}
|
||||
outputs:
|
||||
runner-name: ${{ steps.start-instance.outputs.label }}
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
|
||||
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
backend: hyperstack
|
||||
profile: multi-h100
|
||||
backend: ${{ inputs.backend }}
|
||||
profile: ${{ inputs.profile }}
|
||||
|
||||
cuda-integer-multi-bit-multi-gpu-benchmarks:
|
||||
name: Execute multi GPU integer multi-bit benchmarks
|
||||
needs: setup-instance
|
||||
cuda-benchmarks:
|
||||
name: Cuda benchmarks (${{ inputs.profile }})
|
||||
needs: [ prepare-matrix, setup-instance ]
|
||||
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
|
||||
timeout-minutes: 1440 # 24 hours
|
||||
continue-on-error: true
|
||||
@@ -61,6 +137,10 @@ jobs:
|
||||
fail-fast: false
|
||||
max-parallel: 1
|
||||
matrix:
|
||||
command: ${{ fromJSON(needs.prepare-matrix.outputs.command) }}
|
||||
op_flavor: ${{ fromJSON(needs.prepare-matrix.outputs.op_flavor) }}
|
||||
bench_type: ${{ fromJSON(needs.prepare-matrix.outputs.bench_type) }}
|
||||
# explicit include-based build matrix, of known valid options
|
||||
include:
|
||||
- os: ubuntu-22.04
|
||||
cuda: "12.2"
|
||||
@@ -101,7 +181,7 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
@@ -132,42 +212,36 @@ jobs:
|
||||
path: slab
|
||||
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Check device is detected
|
||||
if: ${{ !cancelled() }}
|
||||
run: nvidia-smi
|
||||
|
||||
- name: Should run benchmarks with all precisions
|
||||
if: inputs.all_precisions
|
||||
run: |
|
||||
echo "FAST_BENCH=FALSE" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Should run fast subset benchmarks
|
||||
if: inputs.fast_default
|
||||
- name: Run benchmarks
|
||||
run: |
|
||||
echo "BENCH_OP_FLAVOR=fast_default" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Check device is detected
|
||||
if: ${{ !cancelled() }}
|
||||
run: nvidia-smi
|
||||
|
||||
- name: Run multi-bit benchmarks with AVX512
|
||||
run: |
|
||||
make bench_unsigned_integer_multi_bit_gpu
|
||||
make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} BENCH_TYPE=${{ matrix.bench_type }} bench_${{ matrix.command }}_gpu
|
||||
|
||||
- name: Parse results
|
||||
run: |
|
||||
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
|
||||
--database tfhe_rs \
|
||||
--hardware "n3-H100x8" \
|
||||
--hardware "${{ inputs.hardware_name }}" \
|
||||
--backend gpu \
|
||||
--project-version "${{ env.COMMIT_HASH }}" \
|
||||
--branch ${{ github.ref_name }} \
|
||||
--commit-date "${{ env.COMMIT_DATE }}" \
|
||||
--bench-date "${{ env.BENCH_DATE }}" \
|
||||
--walk-subdirs \
|
||||
--name-suffix avx512 \
|
||||
--throughput
|
||||
--name-suffix avx512
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
|
||||
with:
|
||||
name: ${{ github.sha }}_integer
|
||||
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
- name: Send data to Slab
|
||||
@@ -178,26 +252,26 @@ jobs:
|
||||
|
||||
slack-notify:
|
||||
name: Slack Notification
|
||||
needs: [ setup-instance, cuda-integer-multi-bit-multi-gpu-benchmarks ]
|
||||
needs: [ setup-instance, cuda-benchmarks ]
|
||||
runs-on: ubuntu-latest
|
||||
if: ${{ always() && needs.cuda-integer-multi-bit-multi-gpu-benchmarks.result != 'skipped' && failure() }}
|
||||
if: ${{ always() && needs.cuda-benchmarks.result != 'skipped' && failure() }}
|
||||
continue-on-error: true
|
||||
steps:
|
||||
- name: Send message
|
||||
uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
|
||||
env:
|
||||
SLACK_COLOR: ${{ needs.cuda-integer-multi-bit-multi-gpu-benchmarks.result }}
|
||||
SLACK_MESSAGE: "Integer multi GPU multi-bit benchmarks finished with status: ${{ needs.cuda-integer-multi-bit-multi-gpu-benchmarks.result }}. (${{ env.ACTION_RUN_URL }})"
|
||||
SLACK_COLOR: ${{ needs.cuda-benchmarks.result }}
|
||||
SLACK_MESSAGE: "Cuda benchmarks (${{ inputs.profile }}) finished with status: ${{ needs.cuda-benchmarks.result }}. (${{ env.ACTION_RUN_URL }})"
|
||||
|
||||
teardown-instance:
|
||||
name: Teardown instance (cuda-integer-multi-bit-multi-gpu-benchmarks)
|
||||
name: Teardown instance (cuda-${{ inputs.profile }}-benchmarks)
|
||||
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
|
||||
needs: [ setup-instance, cuda-integer-multi-bit-multi-gpu-benchmarks ]
|
||||
needs: [ setup-instance, cuda-benchmarks, slack-notify ]
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
|
||||
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -211,4 +285,4 @@ jobs:
|
||||
uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "Instance teardown (cuda-integer-multi-bit-multi-gpu-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
SLACK_MESSAGE: "Instance teardown (cuda-${{ inputs.profile }}-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
200
.github/workflows/benchmark_gpu_integer_full.yml
vendored
200
.github/workflows/benchmark_gpu_integer_full.yml
vendored
@@ -1,200 +0,0 @@
|
||||
# Run all integer benchmarks on an instance with CUDA and return parsed results to Slab CI bot.
|
||||
name: Integer GPU full benchmarks
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
schedule:
|
||||
# Weekly benchmarks will be triggered each Saturday at 1a.m.
|
||||
- cron: '0 1 * * 6'
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUST_BACKTRACE: "full"
|
||||
RUST_MIN_STACK: "8388608"
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
|
||||
jobs:
|
||||
setup-instance:
|
||||
name: Setup instance (cuda-integer-full-benchmarks)
|
||||
runs-on: ubuntu-latest
|
||||
if: github.event_name != 'schedule' ||
|
||||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
|
||||
outputs:
|
||||
runner-name: ${{ steps.start-instance.outputs.label }}
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
backend: hyperstack
|
||||
profile: single-h100
|
||||
|
||||
cuda-integer-full-benchmarks:
|
||||
name: Execute GPU integer benchmarks for all operations flavor
|
||||
needs: setup-instance
|
||||
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
|
||||
timeout-minutes: 1440 # 24 hours
|
||||
continue-on-error: true
|
||||
strategy:
|
||||
fail-fast: false
|
||||
max-parallel: 1
|
||||
matrix:
|
||||
command: [integer, integer_multi_bit]
|
||||
op_flavor: [default]
|
||||
# explicit include-based build matrix, of known valid options
|
||||
include:
|
||||
- os: ubuntu-22.04
|
||||
cuda: "12.2"
|
||||
gcc: 11
|
||||
env:
|
||||
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
|
||||
CMAKE_VERSION: 3.29.6
|
||||
steps:
|
||||
# Mandatory on hyperstack since a bootable volume is not re-usable yet.
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
sudo apt update
|
||||
sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
|
||||
wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
|
||||
tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
|
||||
cd cmake-${{ env.CMAKE_VERSION }}
|
||||
./bootstrap
|
||||
make -j"$(nproc)"
|
||||
sudo make install
|
||||
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
with:
|
||||
fetch-depth: 0
|
||||
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Get benchmark details
|
||||
run: |
|
||||
{
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)";
|
||||
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
|
||||
echo "COMMIT_HASH=$(git describe --tags --dirty)";
|
||||
} >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Set up home
|
||||
# "Install rust" step require root user to have a HOME directory which is not set.
|
||||
run: |
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
- name: Export CUDA variables
|
||||
if: ${{ !cancelled() }}
|
||||
run: |
|
||||
{
|
||||
echo "CUDA_PATH=$CUDA_PATH";
|
||||
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
|
||||
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
|
||||
} >> "${GITHUB_ENV}"
|
||||
echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
|
||||
|
||||
# Specify the correct host compilers
|
||||
- name: Export gcc and g++ variables
|
||||
if: ${{ !cancelled() }}
|
||||
run: |
|
||||
{
|
||||
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
|
||||
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
|
||||
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
|
||||
} >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Check device is detected
|
||||
if: ${{ !cancelled() }}
|
||||
run: nvidia-smi
|
||||
|
||||
- name: Run benchmarks with AVX512
|
||||
run: |
|
||||
make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_${{ matrix.command }}_gpu
|
||||
|
||||
# Run these benchmarks only once
|
||||
- name: Run compression benchmarks with AVX512
|
||||
if: matrix.op_flavor == 'default' && matrix.command == 'integer'
|
||||
run: |
|
||||
make bench_integer_compression_gpu
|
||||
|
||||
- name: Parse results
|
||||
run: |
|
||||
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
|
||||
--database tfhe_rs \
|
||||
--hardware "n3-H100x1" \
|
||||
--backend gpu \
|
||||
--project-version "${{ env.COMMIT_HASH }}" \
|
||||
--branch ${{ github.ref_name }} \
|
||||
--commit-date "${{ env.COMMIT_DATE }}" \
|
||||
--bench-date "${{ env.BENCH_DATE }}" \
|
||||
--walk-subdirs \
|
||||
--name-suffix avx512 \
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
|
||||
with:
|
||||
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
- name: Send data to Slab
|
||||
shell: bash
|
||||
run: |
|
||||
python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
|
||||
--slab-url "${{ secrets.SLAB_URL }}"
|
||||
|
||||
slack-notify:
|
||||
name: Slack Notification
|
||||
needs: [ setup-instance, cuda-integer-full-benchmarks ]
|
||||
runs-on: ubuntu-latest
|
||||
if: ${{ always() && needs.cuda-integer-full-benchmarks.result != 'skipped' && failure() }}
|
||||
continue-on-error: true
|
||||
steps:
|
||||
- name: Send message
|
||||
uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
|
||||
env:
|
||||
SLACK_COLOR: ${{ needs.cuda-integer-full-benchmarks.result }}
|
||||
SLACK_MESSAGE: "Integer GPU full benchmarks finished with status: ${{ needs.cuda-integer-full-benchmarks.result }}. (${{ env.ACTION_RUN_URL }})"
|
||||
|
||||
teardown-instance:
|
||||
name: Teardown instance (cuda-integer-full-benchmarks)
|
||||
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
|
||||
needs: [ setup-instance, cuda-integer-full-benchmarks, slack-notify ]
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
label: ${{ needs.setup-instance.outputs.runner-name }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "Instance teardown (cuda-integer-full-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
@@ -1,224 +0,0 @@
|
||||
# Run integer benchmarks with multi-bit cryptographic parameters on an instance and return parsed results to Slab CI bot.
|
||||
name: Integer GPU Multi-bit benchmarks
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
all_precisions:
|
||||
description: "Run all precisions"
|
||||
type: boolean
|
||||
default: false
|
||||
fast_default:
|
||||
description: "Run only deduplicated default operations without scalar variants"
|
||||
type: boolean
|
||||
default: false
|
||||
|
||||
schedule:
|
||||
# Weekly benchmarks will be triggered each Saturday at 1a.m.
|
||||
- cron: '0 1 * * 6'
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
|
||||
PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUST_BACKTRACE: "full"
|
||||
RUST_MIN_STACK: "8388608"
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
FAST_BENCH: TRUE
|
||||
BENCH_OP_FLAVOR: default
|
||||
|
||||
jobs:
|
||||
setup-instance:
|
||||
name: Setup instance (cuda-integer-multi-bit-benchmarks)
|
||||
runs-on: ubuntu-latest
|
||||
if: github.event_name != 'schedule' ||
|
||||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
|
||||
outputs:
|
||||
runner-name: ${{ steps.start-instance.outputs.label }}
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
backend: hyperstack
|
||||
profile: single-h100
|
||||
|
||||
cuda-integer-multi-bit-benchmarks:
|
||||
name: Execute GPU integer multi-bit benchmarks
|
||||
needs: setup-instance
|
||||
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
|
||||
timeout-minutes: 1440 # 24 hours
|
||||
strategy:
|
||||
fail-fast: false
|
||||
# explicit include-based build matrix, of known valid options
|
||||
matrix:
|
||||
include:
|
||||
- os: ubuntu-22.04
|
||||
cuda: "12.2"
|
||||
gcc: 11
|
||||
env:
|
||||
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
|
||||
CMAKE_VERSION: 3.29.6
|
||||
steps:
|
||||
# Mandatory on hyperstack since a bootable volume is not re-usable yet.
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
sudo apt update
|
||||
sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
|
||||
wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
|
||||
tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
|
||||
cd cmake-${{ env.CMAKE_VERSION }}
|
||||
./bootstrap
|
||||
make -j"$(nproc)"
|
||||
sudo make install
|
||||
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
with:
|
||||
fetch-depth: 0
|
||||
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Get benchmark details
|
||||
run: |
|
||||
{
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)";
|
||||
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
|
||||
echo "COMMIT_HASH=$(git describe --tags --dirty)";
|
||||
} >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Set up home
|
||||
# "Install rust" step require root user to have a HOME directory which is not set.
|
||||
run: |
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
- name: Export CUDA variables
|
||||
if: ${{ !cancelled() }}
|
||||
run: |
|
||||
{
|
||||
echo "CUDA_PATH=$CUDA_PATH";
|
||||
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
|
||||
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
|
||||
} >> "${GITHUB_ENV}"
|
||||
echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
|
||||
|
||||
# Specify the correct host compilers
|
||||
- name: Export gcc and g++ variables
|
||||
if: ${{ !cancelled() }}
|
||||
run: |
|
||||
{
|
||||
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
|
||||
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
|
||||
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
|
||||
} >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Should run benchmarks with all precisions
|
||||
if: inputs.all_precisions
|
||||
run: |
|
||||
echo "FAST_BENCH=FALSE" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Should run fast subset benchmarks
|
||||
if: inputs.fast_default
|
||||
run: |
|
||||
echo "BENCH_OP_FLAVOR=fast_default" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Check device is detected
|
||||
if: ${{ !cancelled() }}
|
||||
run: nvidia-smi
|
||||
|
||||
- name: Run multi-bit benchmarks with AVX512
|
||||
run: |
|
||||
make bench_unsigned_integer_multi_bit_gpu
|
||||
|
||||
- name: Parse benchmarks to csv
|
||||
run: |
|
||||
make PARSE_INTEGER_BENCH_CSV_FILE=${{ env.PARSE_INTEGER_BENCH_CSV_FILE }} \
|
||||
parse_integer_benches
|
||||
|
||||
- name: Upload csv results artifact
|
||||
uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
|
||||
with:
|
||||
name: ${{ github.sha }}_csv_integer
|
||||
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
|
||||
|
||||
- name: Parse results
|
||||
run: |
|
||||
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
|
||||
--database tfhe_rs \
|
||||
--hardware "n3-H100x1" \
|
||||
--backend gpu \
|
||||
--project-version "${{ env.COMMIT_HASH }}" \
|
||||
--branch ${{ github.ref_name }} \
|
||||
--commit-date "${{ env.COMMIT_DATE }}" \
|
||||
--bench-date "${{ env.BENCH_DATE }}" \
|
||||
--walk-subdirs \
|
||||
--name-suffix avx512 \
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
|
||||
with:
|
||||
name: ${{ github.sha }}_integer
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Send data to Slab
|
||||
shell: bash
|
||||
run: |
|
||||
python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
|
||||
--slab-url "${{ secrets.SLAB_URL }}"
|
||||
|
||||
slack-notify:
|
||||
name: Slack Notification
|
||||
needs: [ setup-instance, cuda-integer-multi-bit-benchmarks ]
|
||||
runs-on: ubuntu-latest
|
||||
if: ${{ always() && needs.cuda-integer-multi-bit-benchmarks.result != 'skipped' && failure() }}
|
||||
continue-on-error: true
|
||||
steps:
|
||||
- name: Send message
|
||||
uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
|
||||
env:
|
||||
SLACK_COLOR: ${{ needs.cuda-integer-multi-bit-benchmarks.result }}
|
||||
SLACK_MESSAGE: "Integer GPU multi-bit benchmarks finished with status: ${{ needs.cuda-integer-multi-bit-benchmarks.result }}. (${{ env.ACTION_RUN_URL }})"
|
||||
|
||||
teardown-instance:
|
||||
name: Teardown instance (cuda-integer-full-benchmarks)
|
||||
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
|
||||
needs: [ setup-instance, cuda-integer-multi-bit-benchmarks, slack-notify ]
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
label: ${{ needs.setup-instance.outputs.runner-name }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "Instance teardown (cuda-integer-multi-bit-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
@@ -1,194 +0,0 @@
|
||||
# Run all integer benchmarks on an instance with CUDA and return parsed results to Slab CI bot.
|
||||
name: Integer multi GPU full benchmarks
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
schedule:
|
||||
# Weekly benchmarks will be triggered each Saturday at 1a.m.
|
||||
- cron: '0 1 * * 6'
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUST_BACKTRACE: "full"
|
||||
RUST_MIN_STACK: "8388608"
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
|
||||
jobs:
|
||||
setup-instance:
|
||||
name: Setup instance (cuda-integer-full-multi-gpu-benchmarks)
|
||||
runs-on: ubuntu-latest
|
||||
if: github.event_name != 'schedule' ||
|
||||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
|
||||
outputs:
|
||||
runner-name: ${{ steps.start-instance.outputs.label }}
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
backend: hyperstack
|
||||
profile: multi-h100
|
||||
|
||||
cuda-integer-full-multi-gpu-benchmarks:
|
||||
name: Execute multi GPU integer benchmarks
|
||||
needs: setup-instance
|
||||
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
|
||||
timeout-minutes: 1440 # 24 hours
|
||||
continue-on-error: true
|
||||
strategy:
|
||||
fail-fast: false
|
||||
max-parallel: 1
|
||||
matrix:
|
||||
command: [integer_multi_bit]
|
||||
op_flavor: [default]
|
||||
# explicit include-based build matrix, of known valid options
|
||||
include:
|
||||
- os: ubuntu-22.04
|
||||
cuda: "12.2"
|
||||
gcc: 11
|
||||
env:
|
||||
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
|
||||
CMAKE_VERSION: 3.29.6
|
||||
steps:
|
||||
# Mandatory on hyperstack since a bootable volume is not re-usable yet.
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
sudo apt update
|
||||
sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
|
||||
wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
|
||||
tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
|
||||
cd cmake-${{ env.CMAKE_VERSION }}
|
||||
./bootstrap
|
||||
make -j"$(nproc)"
|
||||
sudo make install
|
||||
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
with:
|
||||
fetch-depth: 0
|
||||
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Get benchmark details
|
||||
run: |
|
||||
{
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)";
|
||||
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
|
||||
echo "COMMIT_HASH=$(git describe --tags --dirty)";
|
||||
} >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Set up home
|
||||
# "Install rust" step require root user to have a HOME directory which is not set.
|
||||
run: |
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
- name: Export CUDA variables
|
||||
if: ${{ !cancelled() }}
|
||||
run: |
|
||||
{
|
||||
echo "CUDA_PATH=$CUDA_PATH";
|
||||
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
|
||||
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
|
||||
} >> "${GITHUB_ENV}"
|
||||
echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
|
||||
|
||||
# Specify the correct host compilers
|
||||
- name: Export gcc and g++ variables
|
||||
if: ${{ !cancelled() }}
|
||||
run: |
|
||||
{
|
||||
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
|
||||
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
|
||||
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
|
||||
} >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Check device is detected
|
||||
if: ${{ !cancelled() }}
|
||||
run: nvidia-smi
|
||||
|
||||
- name: Run benchmarks with AVX512
|
||||
run: |
|
||||
make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_${{ matrix.command }}_gpu
|
||||
|
||||
- name: Parse results
|
||||
run: |
|
||||
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
|
||||
--database tfhe_rs \
|
||||
--hardware "n3-H100x8" \
|
||||
--backend gpu \
|
||||
--project-version "${{ env.COMMIT_HASH }}" \
|
||||
--branch ${{ github.ref_name }} \
|
||||
--commit-date "${{ env.COMMIT_DATE }}" \
|
||||
--bench-date "${{ env.BENCH_DATE }}" \
|
||||
--walk-subdirs \
|
||||
--name-suffix avx512 \
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
|
||||
with:
|
||||
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
- name: Send data to Slab
|
||||
shell: bash
|
||||
run: |
|
||||
python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
|
||||
--slab-url "${{ secrets.SLAB_URL }}"
|
||||
|
||||
slack-notify:
|
||||
name: Slack Notification
|
||||
needs: [ setup-instance, cuda-integer-full-multi-gpu-benchmarks ]
|
||||
runs-on: ubuntu-latest
|
||||
if: ${{ always() && needs.cuda-integer-full-multi-gpu-benchmarks.result != 'skipped' && failure() }}
|
||||
continue-on-error: true
|
||||
steps:
|
||||
- name: Send message
|
||||
uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
|
||||
env:
|
||||
SLACK_COLOR: ${{ needs.cuda-integer-full-multi-gpu-benchmarks.result }}
|
||||
SLACK_MESSAGE: "Integer GPU full benchmarks finished with status: ${{ needs.cuda-integer-full-multi-gpu-benchmarks.result }}. (${{ env.ACTION_RUN_URL }})"
|
||||
|
||||
teardown-instance:
|
||||
name: Teardown instance (cuda-integer-full-multi-gpu-benchmarks)
|
||||
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
|
||||
needs: [ setup-instance, cuda-integer-full-multi-gpu-benchmarks ]
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
label: ${{ needs.setup-instance.outputs.runner-name }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "Instance teardown (cuda-integer-full-multi-gpu-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
60
.github/workflows/benchmark_gpu_integer_weekly.yml
vendored
Normal file
60
.github/workflows/benchmark_gpu_integer_weekly.yml
vendored
Normal file
@@ -0,0 +1,60 @@
|
||||
# Run CUDA benchmarks on multiple Hyperstack VMs and return parsed results to Slab CI bot.
|
||||
name: Cuda weekly benchmarks
|
||||
|
||||
on:
|
||||
schedule:
|
||||
# Weekly benchmarks will be triggered each Saturday at 1a.m.
|
||||
- cron: '0 1 * * 6'
|
||||
|
||||
jobs:
|
||||
run-benchmarks-1-h100:
|
||||
name: Run benchmarks (1xH100)
|
||||
if: github.repository == 'zama-ai/tfhe-rs'
|
||||
uses: ./.github/workflows/benchmark_gpu_integer_common.yml
|
||||
with:
|
||||
profile: single-h100
|
||||
hardware_name: n3-H100x1
|
||||
command: integer,integer_multi_bit
|
||||
op_flavor: default
|
||||
bench_type: latency
|
||||
all_precisions: true
|
||||
secrets: inherit
|
||||
|
||||
run-benchmarks-2-h100:
|
||||
name: Run benchmarks (2xH100)
|
||||
if: github.repository == 'zama-ai/tfhe-rs'
|
||||
uses: ./.github/workflows/benchmark_gpu_integer_common.yml
|
||||
with:
|
||||
profile: 2-h100
|
||||
hardware_name: n3-H100x2
|
||||
command: integer_multi_bit
|
||||
op_flavor: default
|
||||
bench_type: latency
|
||||
all_precisions: true
|
||||
secrets: inherit
|
||||
|
||||
run-benchmarks-8-h100:
|
||||
name: Run benchmarks (8xH100)
|
||||
if: github.repository == 'zama-ai/tfhe-rs'
|
||||
uses: ./.github/workflows/benchmark_gpu_integer_common.yml
|
||||
with:
|
||||
profile: multi-h100
|
||||
hardware_name: n3-H100x8
|
||||
command: integer_multi_bit
|
||||
op_flavor: default
|
||||
bench_type: latency
|
||||
all_precisions: true
|
||||
secrets: inherit
|
||||
|
||||
run-benchmarks-l40:
|
||||
name: Run benchmarks (L40)
|
||||
if: github.repository == 'zama-ai/tfhe-rs'
|
||||
uses: ./.github/workflows/benchmark_gpu_integer_common.yml
|
||||
with:
|
||||
profile: l40
|
||||
hardware_name: n3-L40x1
|
||||
command: integer_multi_bit,integer_compression,pbs,ks
|
||||
op_flavor: default
|
||||
bench_type: latency
|
||||
all_precisions: true
|
||||
secrets: inherit
|
||||
206
.github/workflows/benchmark_gpu_l40.yml
vendored
206
.github/workflows/benchmark_gpu_l40.yml
vendored
@@ -1,206 +0,0 @@
|
||||
# Run benchmarks on an L40 VM and return parsed results to Slab CI bot.
|
||||
name: Cuda benchmarks (L40)
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
schedule:
|
||||
# Weekly benchmarks will be triggered each Saturday at 1a.m.
|
||||
- cron: '0 1 * * 6'
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUST_BACKTRACE: "full"
|
||||
RUST_MIN_STACK: "8388608"
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
|
||||
jobs:
|
||||
setup-instance:
|
||||
name: Setup instance (cuda-l40-benchmarks)
|
||||
runs-on: ubuntu-latest
|
||||
if: github.event_name != 'schedule' ||
|
||||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
|
||||
outputs:
|
||||
runner-name: ${{ steps.start-instance.outputs.label }}
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
backend: hyperstack
|
||||
profile: l40
|
||||
|
||||
cuda-l40-benchmarks:
|
||||
name: Cuda benchmarks (L40)
|
||||
needs: setup-instance
|
||||
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
|
||||
timeout-minutes: 1440 # 24 hours
|
||||
continue-on-error: true
|
||||
strategy:
|
||||
fail-fast: false
|
||||
max-parallel: 1
|
||||
matrix:
|
||||
command: [integer_multi_bit]
|
||||
op_flavor: [default]
|
||||
# explicit include-based build matrix, of known valid options
|
||||
include:
|
||||
- os: ubuntu-22.04
|
||||
cuda: "12.2"
|
||||
gcc: 11
|
||||
env:
|
||||
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
|
||||
CMAKE_VERSION: 3.29.6
|
||||
steps:
|
||||
# Mandatory on hyperstack since a bootable volume is not re-usable yet.
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
sudo apt update
|
||||
sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
|
||||
wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
|
||||
tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
|
||||
cd cmake-${{ env.CMAKE_VERSION }}
|
||||
./bootstrap
|
||||
make -j"$(nproc)"
|
||||
sudo make install
|
||||
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
with:
|
||||
fetch-depth: 0
|
||||
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Get benchmark details
|
||||
run: |
|
||||
{
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)";
|
||||
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
|
||||
echo "COMMIT_HASH=$(git describe --tags --dirty)";
|
||||
} >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Set up home
|
||||
# "Install rust" step require root user to have a HOME directory which is not set.
|
||||
run: |
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
- name: Export CUDA variables
|
||||
if: ${{ !cancelled() }}
|
||||
run: |
|
||||
{
|
||||
echo "CUDA_PATH=$CUDA_PATH";
|
||||
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
|
||||
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
|
||||
} >> "${GITHUB_ENV}"
|
||||
echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
|
||||
|
||||
# Specify the correct host compilers
|
||||
- name: Export gcc and g++ variables
|
||||
if: ${{ !cancelled() }}
|
||||
run: |
|
||||
{
|
||||
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
|
||||
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
|
||||
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
|
||||
} >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Check device is detected
|
||||
if: ${{ !cancelled() }}
|
||||
run: nvidia-smi
|
||||
|
||||
- name: Run benchmarks with AVX512
|
||||
run: |
|
||||
make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_${{ matrix.command }}_gpu
|
||||
|
||||
- name: Run compression benchmarks with AVX512
|
||||
run: |
|
||||
make bench_integer_compression_gpu
|
||||
|
||||
- name: Run PBS benchmarks
|
||||
run: |
|
||||
make bench_pbs_gpu
|
||||
|
||||
- name: Run KS benchmarks
|
||||
run: |
|
||||
make bench_ks_gpu
|
||||
|
||||
- name: Parse results
|
||||
run: |
|
||||
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
|
||||
--database tfhe_rs \
|
||||
--hardware "n3-L40x1" \
|
||||
--backend gpu \
|
||||
--project-version "${{ env.COMMIT_HASH }}" \
|
||||
--branch ${{ github.ref_name }} \
|
||||
--commit-date "${{ env.COMMIT_DATE }}" \
|
||||
--bench-date "${{ env.BENCH_DATE }}" \
|
||||
--walk-subdirs \
|
||||
--name-suffix avx512 \
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
|
||||
with:
|
||||
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
- name: Send data to Slab
|
||||
shell: bash
|
||||
run: |
|
||||
python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
|
||||
--slab-url "${{ secrets.SLAB_URL }}"
|
||||
|
||||
slack-notify:
|
||||
name: Slack Notification
|
||||
needs: [ setup-instance, cuda-l40-benchmarks ]
|
||||
runs-on: ubuntu-latest
|
||||
if: ${{ always() && needs.cuda-l40-benchmarks.result != 'skipped' && failure() }}
|
||||
continue-on-error: true
|
||||
steps:
|
||||
- name: Send message
|
||||
uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
|
||||
env:
|
||||
SLACK_COLOR: ${{ needs.cuda-l40-benchmarks.result }}
|
||||
SLACK_MESSAGE: "Cuda benchmarks (L40) finished with status: ${{ needs.cuda-l40-benchmarks.result }}. (${{ env.ACTION_RUN_URL }})"
|
||||
|
||||
teardown-instance:
|
||||
name: Teardown instance (cuda-l40-benchmarks)
|
||||
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
|
||||
needs: [ setup-instance, cuda-l40-benchmarks, slack-notify ]
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
label: ${{ needs.setup-instance.outputs.runner-name }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "Instance teardown (cuda-l40-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
61
.github/workflows/benchmark_integer.yml
vendored
61
.github/workflows/benchmark_integer.yml
vendored
@@ -8,6 +8,14 @@ on:
|
||||
description: "Run all precisions"
|
||||
type: boolean
|
||||
default: false
|
||||
bench_type:
|
||||
description: "Benchmarks type"
|
||||
type: choice
|
||||
default: latency
|
||||
options:
|
||||
- latency
|
||||
- throughput
|
||||
- both
|
||||
|
||||
schedule:
|
||||
# Weekly benchmarks will be triggered each Saturday at 1a.m.
|
||||
@@ -27,6 +35,7 @@ env:
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
FAST_BENCH: TRUE
|
||||
BENCH_TYPE: latency
|
||||
|
||||
jobs:
|
||||
prepare-matrix:
|
||||
@@ -36,10 +45,10 @@ jobs:
|
||||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
|
||||
outputs:
|
||||
op_flavor: ${{ steps.set_op_flavor.outputs.op_flavor }}
|
||||
bench_type: ${{ steps.set_bench_type.outputs.bench_type }}
|
||||
steps:
|
||||
- name: Weekly benchmarks
|
||||
if: github.event_name == 'workflow_dispatch' ||
|
||||
github.event.schedule == '0 1 * * 6'
|
||||
if: github.event.schedule == '0 1 * * 6'
|
||||
run: |
|
||||
echo "OP_FLAVOR=[\"default\"]" >> "${GITHUB_ENV}"
|
||||
|
||||
@@ -48,11 +57,31 @@ jobs:
|
||||
run: |
|
||||
echo "OP_FLAVOR=[\"default\", \"smart\", \"unchecked\", \"misc\"]" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Set operation flavor output
|
||||
id: set_op_flavor
|
||||
run: |
|
||||
- name: Set benchmark types
|
||||
if: github.event_name == 'workflow_dispatch'
|
||||
run: |
|
||||
echo "OP_FLAVOR=[\"default\"]" >> "${GITHUB_ENV}"
|
||||
if [[ "${{ inputs.bench_type }}" == "both" ]]; then
|
||||
echo "BENCH_TYPE=[\"latency\", \"throughput\"]" >> "${GITHUB_ENV}"
|
||||
else
|
||||
echo "BENCH_TYPE=[\"${{ inputs.bench_type }}\"]" >> "${GITHUB_ENV}"
|
||||
fi
|
||||
|
||||
- name: Default benchmark type
|
||||
if: github.event_name != 'workflow_dispatch'
|
||||
run: |
|
||||
echo "BENCH_TYPE=[\"latency\"]" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Set operation flavor output
|
||||
id: set_op_flavor
|
||||
run: |
|
||||
echo "op_flavor=${{ toJSON(env.OP_FLAVOR) }}" >> "${GITHUB_OUTPUT}"
|
||||
|
||||
- name: Set benchmark types output
|
||||
id: set_bench_type
|
||||
run: |
|
||||
echo "bench_type=${{ toJSON(env.BENCH_TYPE) }}" >> "${GITHUB_OUTPUT}"
|
||||
|
||||
setup-instance:
|
||||
name: Setup instance (integer-benchmarks)
|
||||
needs: prepare-matrix
|
||||
@@ -62,7 +91,7 @@ jobs:
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
|
||||
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -85,6 +114,7 @@ jobs:
|
||||
matrix:
|
||||
command: [ integer, integer_multi_bit]
|
||||
op_flavor: ${{ fromJson(needs.prepare-matrix.outputs.op_flavor) }}
|
||||
bench_type: ${{ fromJSON(needs.prepare-matrix.outputs.bench_type) }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
@@ -100,13 +130,8 @@ jobs:
|
||||
echo "COMMIT_HASH=$(git describe --tags --dirty)";
|
||||
} >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Set up home
|
||||
# "Install rust" step require root user to have a HOME directory which is not set.
|
||||
run: |
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
@@ -124,13 +149,13 @@ jobs:
|
||||
|
||||
- name: Run benchmarks with AVX512
|
||||
run: |
|
||||
make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_${{ matrix.command }}
|
||||
make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} BENCH_TYPE=${{ matrix.bench_type }} bench_${{ matrix.command }}
|
||||
|
||||
# Run these benchmarks only once
|
||||
# Run these benchmarks only once per benchmark type
|
||||
- name: Run compression benchmarks with AVX512
|
||||
if: matrix.op_flavor == 'default' && matrix.command == 'integer'
|
||||
run: |
|
||||
make bench_integer_compression
|
||||
make BENCH_TYPE=${{ matrix.bench_type }} bench_integer_compression
|
||||
|
||||
- name: Parse results
|
||||
run: |
|
||||
@@ -143,12 +168,12 @@ jobs:
|
||||
--bench-date "${{ env.BENCH_DATE }}" \
|
||||
--walk-subdirs \
|
||||
--name-suffix avx512 \
|
||||
--throughput
|
||||
--bench-type ${{ env.BENCH_TYPE }}
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
|
||||
with:
|
||||
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
|
||||
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}_${{ matrix.bench_type }}
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
- name: Send data to Slab
|
||||
@@ -173,7 +198,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
|
||||
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
14
.github/workflows/benchmark_shortint.yml
vendored
14
.github/workflows/benchmark_shortint.yml
vendored
@@ -56,7 +56,7 @@ jobs:
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
|
||||
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -92,13 +92,8 @@ jobs:
|
||||
echo "COMMIT_HASH=$(git describe --tags --dirty)";
|
||||
} >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Set up home
|
||||
# "Install rust" step require root user to have a HOME directory which is not set.
|
||||
run: |
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
@@ -125,8 +120,7 @@ jobs:
|
||||
--commit-date "${COMMIT_DATE}" \
|
||||
--bench-date "${{ env.BENCH_DATE }}" \
|
||||
--walk-subdirs \
|
||||
--name-suffix avx512 \
|
||||
--throughput
|
||||
--name-suffix avx512
|
||||
|
||||
# This small benchmark needs to be executed only once.
|
||||
- name: Measure key sizes
|
||||
@@ -169,7 +163,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
|
||||
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
52
.github/workflows/benchmark_signed_integer.yml
vendored
52
.github/workflows/benchmark_signed_integer.yml
vendored
@@ -8,6 +8,14 @@ on:
|
||||
description: "Run all precisions"
|
||||
type: boolean
|
||||
default: false
|
||||
bench_type:
|
||||
description: "Benchmarks type"
|
||||
type: choice
|
||||
default: latency
|
||||
options:
|
||||
- latency
|
||||
- throughput
|
||||
- both
|
||||
|
||||
schedule:
|
||||
# Weekly benchmarks will be triggered each Saturday at 1a.m.
|
||||
@@ -36,10 +44,10 @@ jobs:
|
||||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
|
||||
outputs:
|
||||
op_flavor: ${{ steps.set_op_flavor.outputs.op_flavor }}
|
||||
bench_type: ${{ steps.set_bench_type.outputs.bench_type }}
|
||||
steps:
|
||||
- name: Weekly benchmarks
|
||||
if: github.event_name == 'workflow_dispatch' ||
|
||||
github.event.schedule == '0 1 * * 6'
|
||||
if: github.event.schedule == '0 1 * * 6'
|
||||
run: |
|
||||
echo "OP_FLAVOR=[\"default\"]" >> "${GITHUB_ENV}"
|
||||
|
||||
@@ -48,11 +56,31 @@ jobs:
|
||||
run: |
|
||||
echo "OP_FLAVOR=[\"default\", \"unchecked\"]" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Set benchmark types
|
||||
if: github.event_name == 'workflow_dispatch'
|
||||
run: |
|
||||
echo "OP_FLAVOR=[\"default\"]" >> "${GITHUB_ENV}"
|
||||
if [[ "${{ inputs.bench_type }}" == "both" ]]; then
|
||||
echo "BENCH_TYPE=[\"latency\", \"throughput\"]" >> "${GITHUB_ENV}"
|
||||
else
|
||||
echo "BENCH_TYPE=[\"${{ inputs.bench_type }}\"]" >> "${GITHUB_ENV}"
|
||||
fi
|
||||
|
||||
- name: Default benchmark type
|
||||
if: github.event_name != 'workflow_dispatch'
|
||||
run: |
|
||||
echo "BENCH_TYPE=[\"latency\"]" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Set operation flavor output
|
||||
id: set_op_flavor
|
||||
run: |
|
||||
echo "op_flavor=${{ toJSON(env.OP_FLAVOR) }}" >> "${GITHUB_OUTPUT}"
|
||||
|
||||
- name: Set benchmark types output
|
||||
id: set_bench_type
|
||||
run: |
|
||||
echo "bench_type=${{ toJSON(env.BENCH_TYPE) }}" >> "${GITHUB_OUTPUT}"
|
||||
|
||||
setup-instance:
|
||||
name: Setup instance (signed-integer-benchmarks)
|
||||
needs: prepare-matrix
|
||||
@@ -62,7 +90,7 @@ jobs:
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
|
||||
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -84,7 +112,8 @@ jobs:
|
||||
max-parallel: 1
|
||||
matrix:
|
||||
command: [ integer, integer_multi_bit ]
|
||||
op_flavor: [ default, unchecked ]
|
||||
op_flavor: ${{ fromJSON(needs.prepare-matrix.outputs.op_flavor) }}
|
||||
bench_type: ${{ fromJSON(needs.prepare-matrix.outputs.bench_type) }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
@@ -100,13 +129,8 @@ jobs:
|
||||
echo "COMMIT_HASH=$(git describe --tags --dirty)";
|
||||
} >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Set up home
|
||||
# "Install rust" step require root user to have a HOME directory which is not set.
|
||||
run: |
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
@@ -124,7 +148,7 @@ jobs:
|
||||
|
||||
- name: Run benchmarks with AVX512
|
||||
run: |
|
||||
make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_signed_${{ matrix.command }}
|
||||
make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} BENCH_TYPE=${{ matrix.bench_type }} bench_signed_${{ matrix.command }}
|
||||
|
||||
- name: Parse results
|
||||
run: |
|
||||
@@ -137,12 +161,12 @@ jobs:
|
||||
--bench-date "${{ env.BENCH_DATE }}" \
|
||||
--walk-subdirs \
|
||||
--name-suffix avx512 \
|
||||
--throughput
|
||||
--bench-type ${{ matrix.bench_type }}
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
|
||||
with:
|
||||
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
|
||||
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}_${{ matrix.bench_type }}
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
- name: Send data to Slab
|
||||
@@ -167,7 +191,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
|
||||
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
140
.github/workflows/benchmark_tfhe_fft.yml
vendored
Normal file
140
.github/workflows/benchmark_tfhe_fft.yml
vendored
Normal file
@@ -0,0 +1,140 @@
|
||||
# Run FFT benchmarks on an AWS instance and return parsed results to Slab CI bot.
|
||||
name: FFT benchmarks
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
|
||||
RUST_BACKTRACE: "full"
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
push:
|
||||
branches:
|
||||
- "main"
|
||||
schedule:
|
||||
# Job will be triggered each Thursday at 11p.m.
|
||||
- cron: '0 23 * * 4'
|
||||
|
||||
jobs:
|
||||
setup-ec2:
|
||||
name: Setup EC2 instance (fft-benchmarks)
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
runner-name: ${{ steps.start-instance.outputs.label }}
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
backend: aws
|
||||
profile: bench
|
||||
|
||||
fft-benchmarks:
|
||||
name: Execute FFT benchmarks in EC2
|
||||
needs: setup-ec2
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}_${{ github.ref }}
|
||||
cancel-in-progress: true
|
||||
runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@ac593985615ec2ede58e132d2e21d2b1cbd6127c
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Get benchmark details
|
||||
run: |
|
||||
{
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)";
|
||||
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
|
||||
echo "COMMIT_HASH=$(git describe --tags --dirty)";
|
||||
} >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
|
||||
with:
|
||||
toolchain: nightly
|
||||
override: true
|
||||
|
||||
- name: Run benchmarks with AVX512
|
||||
run: |
|
||||
make bench_fft
|
||||
|
||||
- name: Parse AVX512 results
|
||||
run: |
|
||||
python3 ./ci/fft_benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
|
||||
--database concrete_fft \
|
||||
--hardware "hpc7a.96xlarge" \
|
||||
--project-version "${{ env.COMMIT_HASH }}" \
|
||||
--branch ${{ github.ref_name }} \
|
||||
--commit-date "${{ env.COMMIT_DATE }}" \
|
||||
--bench-date "${{ env.BENCH_DATE }}" \
|
||||
--name-suffix avx512
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
|
||||
with:
|
||||
name: ${{ github.sha }}_fft
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@ac593985615ec2ede58e132d2e21d2b1cbd6127c
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Send data to Slab
|
||||
shell: bash
|
||||
run: |
|
||||
echo "Computing HMac on downloaded artifact"
|
||||
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
|
||||
echo "Sending results to Slab..."
|
||||
curl -v -k \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "X-Slab-Repository: ${{ github.repository }}" \
|
||||
-H "X-Slab-Command: store_data_v2" \
|
||||
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
|
||||
-d @${{ env.RESULTS_FILENAME }} \
|
||||
${{ secrets.SLAB_URL }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "tfhe-fft benchmarks failed. (${{ env.ACTION_RUN_URL }})"
|
||||
|
||||
teardown-ec2:
|
||||
name: Teardown EC2 instance (fft-benchmarks)
|
||||
if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
|
||||
needs: [ setup-ec2, fft-benchmarks ]
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
label: ${{ needs.setup-ec2.outputs.runner-name }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "EC2 teardown (fft-benchmarks) failed. (${{ env.ACTION_RUN_URL }})"
|
||||
140
.github/workflows/benchmark_tfhe_ntt.yml
vendored
Normal file
140
.github/workflows/benchmark_tfhe_ntt.yml
vendored
Normal file
@@ -0,0 +1,140 @@
|
||||
# Run NTT benchmarks on an AWS instance and return parsed results to Slab CI bot.
|
||||
name: NTT benchmarks
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
|
||||
RUST_BACKTRACE: "full"
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
push:
|
||||
branches:
|
||||
- "main"
|
||||
schedule:
|
||||
# Job will be triggered each Friday at 11p.m.
|
||||
- cron: "0 23 * * 5"
|
||||
|
||||
jobs:
|
||||
setup-ec2:
|
||||
name: Setup EC2 instance (ntt-benchmarks)
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
runner-name: ${{ steps.start-instance.outputs.label }}
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
backend: aws
|
||||
profile: bench
|
||||
|
||||
ntt-benchmarks:
|
||||
name: Execute NTT benchmarks in EC2
|
||||
needs: setup-ec2
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}_${{ github.ref }}
|
||||
cancel-in-progress: true
|
||||
runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@ac593985615ec2ede58e132d2e21d2b1cbd6127c
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Get benchmark details
|
||||
run: |
|
||||
{
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)";
|
||||
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
|
||||
echo "COMMIT_HASH=$(git describe --tags --dirty)";
|
||||
} >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
|
||||
with:
|
||||
toolchain: nightly
|
||||
override: true
|
||||
|
||||
- name: Run benchmarks
|
||||
run: |
|
||||
make bench_ntt
|
||||
|
||||
- name: Parse results
|
||||
run: |
|
||||
python3 ./ci/ntt_benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
|
||||
--database concrete_ntt \
|
||||
--hardware "hpc7a.96xlarge" \
|
||||
--project-version "${{ env.COMMIT_HASH }}" \
|
||||
--branch ${{ github.ref_name }} \
|
||||
--commit-date "${{ env.COMMIT_DATE }}" \
|
||||
--bench-date "${{ env.BENCH_DATE }}" \
|
||||
--name-suffix avx512
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
|
||||
with:
|
||||
name: ${{ github.sha }}_ntt
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@ac593985615ec2ede58e132d2e21d2b1cbd6127c
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Send data to Slab
|
||||
shell: bash
|
||||
run: |
|
||||
echo "Computing HMac on downloaded artifact"
|
||||
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
|
||||
echo "Sending results to Slab..."
|
||||
curl -v -k \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "X-Slab-Repository: ${{ github.repository }}" \
|
||||
-H "X-Slab-Command: store_data_v2" \
|
||||
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
|
||||
-d @${{ env.RESULTS_FILENAME }} \
|
||||
${{ secrets.SLAB_URL }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "tfhe-ntt benchmarks failed. (${{ env.ACTION_RUN_URL }})"
|
||||
|
||||
teardown-ec2:
|
||||
name: Teardown EC2 instance (ntt-benchmarks)
|
||||
if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
|
||||
needs: [setup-ec2, ntt-benchmarks]
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
label: ${{ needs.setup-ec2.outputs.runner-name }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "EC2 teardown (ntt-benchmarks) failed. (${{ env.ACTION_RUN_URL }})"
|
||||
173
.github/workflows/benchmark_tfhe_zk_pok.yml
vendored
Normal file
173
.github/workflows/benchmark_tfhe_zk_pok.yml
vendored
Normal file
@@ -0,0 +1,173 @@
|
||||
# Run benchmarks of the tfhe-zk-pok crate on an instance and return parsed results to Slab CI bot.
|
||||
name: tfhe-zk-pok benchmarks
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
schedule:
|
||||
# Weekly benchmarks will be triggered each Saturday at 3a.m.
|
||||
- cron: '0 3 * * 6'
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
|
||||
PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUST_BACKTRACE: "full"
|
||||
RUST_MIN_STACK: "8388608"
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
|
||||
jobs:
|
||||
should-run:
|
||||
runs-on: ubuntu-latest
|
||||
if: github.event_name == 'workflow_dispatch' ||
|
||||
((github.event_name == 'push' || github.event_name == 'schedule') && github.repository == 'zama-ai/tfhe-rs')
|
||||
outputs:
|
||||
zk_pok_changed: ${{ steps.changed-files.outputs.zk_pok_any_changed }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Check for file changes
|
||||
id: changed-files
|
||||
uses: tj-actions/changed-files@c3a1bb2c992d77180ae65be6ae6c166cf40f857c
|
||||
with:
|
||||
since_last_remote_commit: true
|
||||
files_yaml: |
|
||||
zk_pok:
|
||||
- tfhe-zk-pok/**
|
||||
- .github/workflows/benchmark_tfhe_zk_pok.yml
|
||||
|
||||
setup-instance:
|
||||
name: Setup instance (tfhe-zk-pok-benchmarks)
|
||||
runs-on: ubuntu-latest
|
||||
needs: should-run
|
||||
if: github.event_name == 'workflow_dispatch' ||
|
||||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
|
||||
(github.event_name == 'push' &&
|
||||
github.repository == 'zama-ai/tfhe-rs' &&
|
||||
needs.should-run.outputs.zk_pok_changed == 'true')
|
||||
outputs:
|
||||
runner-name: ${{ steps.start-instance.outputs.label }}
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
backend: aws
|
||||
profile: bench
|
||||
|
||||
tfhe-zk-pok-benchmarks:
|
||||
name: Execute tfhe-zk-pok benchmarks
|
||||
if: needs.setup-instance.result != 'skipped'
|
||||
needs: setup-instance
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}_${{github.event_name}}_${{ github.ref }}${{ github.ref == 'refs/heads/main' && github.sha || '' }}
|
||||
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
|
||||
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
with:
|
||||
fetch-depth: 0
|
||||
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Get benchmark details
|
||||
run: |
|
||||
{
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)";
|
||||
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
|
||||
echo "COMMIT_HASH=$(git describe --tags --dirty)";
|
||||
} >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Run benchmarks
|
||||
run: |
|
||||
make bench_tfhe_zk_pok
|
||||
|
||||
- name: Parse results
|
||||
run: |
|
||||
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
|
||||
--database tfhe_rs \
|
||||
--crate tfhe-zk-pok \
|
||||
--hardware "hpc7a.96xlarge" \
|
||||
--backend cpu \
|
||||
--project-version "${{ env.COMMIT_HASH }}" \
|
||||
--branch ${{ github.ref_name }} \
|
||||
--commit-date "${{ env.COMMIT_DATE }}" \
|
||||
--bench-date "${{ env.BENCH_DATE }}" \
|
||||
--walk-subdirs \
|
||||
--name-suffix avx512 \
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
|
||||
with:
|
||||
name: ${{ github.sha }}_tfhe_zk_pok
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Send data to Slab
|
||||
shell: bash
|
||||
run: |
|
||||
python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
|
||||
--slab-url "${{ secrets.SLAB_URL }}"
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "tfhe-zk-pok benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
|
||||
teardown-instance:
|
||||
name: Teardown instance (tfhe-zk-pok-benchmarks)
|
||||
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
|
||||
needs: [ setup-instance, tfhe-zk-pok-benchmarks ]
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
label: ${{ needs.setup-instance.outputs.runner-name }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "Instance teardown (tfhe-zk-pok-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
10
.github/workflows/benchmark_wasm_client.yml
vendored
10
.github/workflows/benchmark_wasm_client.yml
vendored
@@ -39,13 +39,13 @@ jobs:
|
||||
|
||||
- name: Check for file changes
|
||||
id: changed-files
|
||||
uses: tj-actions/changed-files@c3a1bb2c992d77180ae65be6ae6c166cf40f857c
|
||||
uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf
|
||||
with:
|
||||
since_last_remote_commit: true
|
||||
files_yaml: |
|
||||
wasm_bench:
|
||||
- tfhe/Cargo.toml
|
||||
- concrete-csprng/**
|
||||
- tfhe-csprng/**
|
||||
- tfhe-zk-pok/**
|
||||
- tfhe/src/**
|
||||
- '!tfhe/src/c_api/**'
|
||||
@@ -64,7 +64,7 @@ jobs:
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
|
||||
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -98,7 +98,7 @@ jobs:
|
||||
} >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
@@ -199,7 +199,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
|
||||
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
30
.github/workflows/benchmark_zk_pke.yml
vendored
30
.github/workflows/benchmark_zk_pke.yml
vendored
@@ -3,6 +3,12 @@ name: PKE ZK benchmarks
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
run_throughput:
|
||||
description: "Run throughput benchmarks"
|
||||
type: boolean
|
||||
default: false
|
||||
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
@@ -20,6 +26,7 @@ env:
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
BENCH_TYPE: latency
|
||||
|
||||
jobs:
|
||||
should-run:
|
||||
@@ -36,13 +43,14 @@ jobs:
|
||||
|
||||
- name: Check for file changes
|
||||
id: changed-files
|
||||
uses: tj-actions/changed-files@c3a1bb2c992d77180ae65be6ae6c166cf40f857c
|
||||
uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf
|
||||
with:
|
||||
since_last_remote_commit: true
|
||||
files_yaml: |
|
||||
zk_pok:
|
||||
- tfhe/Cargo.toml
|
||||
- concrete-csprng/**
|
||||
- tfhe-csprng/**
|
||||
- tfhe-fft/**
|
||||
- tfhe-zk-pok/**
|
||||
- tfhe/src/core_crypto/**
|
||||
- tfhe/src/shortint/**
|
||||
@@ -65,7 +73,7 @@ jobs:
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
|
||||
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -97,13 +105,8 @@ jobs:
|
||||
echo "COMMIT_HASH=$(git describe --tags --dirty)";
|
||||
} >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Set up home
|
||||
# "Install rust" step require root user to have a HOME directory which is not set.
|
||||
run: |
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
@@ -114,6 +117,11 @@ jobs:
|
||||
path: slab
|
||||
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Should run throughput benchmarks
|
||||
if: inputs.run_throughput
|
||||
run: |
|
||||
echo "BENCH_TYPE=throughput" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Run benchmarks with AVX512
|
||||
run: |
|
||||
make bench_integer_zk
|
||||
@@ -130,7 +138,7 @@ jobs:
|
||||
--bench-date "${{ env.BENCH_DATE }}" \
|
||||
--walk-subdirs \
|
||||
--name-suffix avx512 \
|
||||
--throughput
|
||||
--bench-type ${{ env.BENCH_TYPE }}
|
||||
|
||||
- name: Parse CRS sizes results
|
||||
run: |
|
||||
@@ -173,7 +181,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
|
||||
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
8
.github/workflows/cargo_build.yml
vendored
8
.github/workflows/cargo_build.yml
vendored
@@ -1,4 +1,4 @@
|
||||
name: Cargo Build
|
||||
name: Cargo Build TFHE-rs
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
@@ -28,7 +28,7 @@ jobs:
|
||||
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
@@ -47,10 +47,10 @@ jobs:
|
||||
run: |
|
||||
make pcc
|
||||
|
||||
- name: Build concrete-csprng
|
||||
- name: Build tfhe-csprng
|
||||
if: ${{ contains(matrix.os, 'ubuntu') }}
|
||||
run: |
|
||||
make build_concrete_csprng
|
||||
make build_tfhe_csprng
|
||||
|
||||
- name: Build Release core
|
||||
if: ${{ contains(matrix.os, 'ubuntu') }}
|
||||
|
||||
44
.github/workflows/cargo_build_tfhe_fft.yml
vendored
Normal file
44
.github/workflows/cargo_build_tfhe_fft.yml
vendored
Normal file
@@ -0,0 +1,44 @@
|
||||
# Build tfhe-fft
|
||||
name: Cargo Build tfhe-fft
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
cargo-builds-fft:
|
||||
runs-on: ${{ matrix.runner_type }}
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
runner_type: [ubuntu-latest, macos-latest, windows-latest]
|
||||
fail-fast: false
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@ac593985615ec2ede58e132d2e21d2b1cbd6127c
|
||||
|
||||
- name: Install Rust
|
||||
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
|
||||
with:
|
||||
toolchain: stable
|
||||
override: true
|
||||
|
||||
- name: Run pcc checks
|
||||
if: matrix.runner_type == 'ubuntu-latest'
|
||||
run: |
|
||||
sudo apt install -y libfftw3-dev
|
||||
make pcc_fft
|
||||
|
||||
- name: Build release
|
||||
run: |
|
||||
make build_fft
|
||||
|
||||
- name: Build release no-std
|
||||
run: |
|
||||
make build_fft_no_std
|
||||
40
.github/workflows/cargo_build_tfhe_ntt.yml
vendored
Normal file
40
.github/workflows/cargo_build_tfhe_ntt.yml
vendored
Normal file
@@ -0,0 +1,40 @@
|
||||
# Build tfhe-ntt
|
||||
name: Cargo Build tfhe-ntt
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
cargo-builds:
|
||||
runs-on: ${{ matrix.os }}
|
||||
strategy:
|
||||
matrix:
|
||||
os: [ubuntu-latest, macos-latest, windows-latest]
|
||||
fail-fast: false
|
||||
steps:
|
||||
- uses: actions/checkout@ac593985615ec2ede58e132d2e21d2b1cbd6127c
|
||||
|
||||
- name: Install Rust
|
||||
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
|
||||
with:
|
||||
toolchain: stable
|
||||
override: true
|
||||
|
||||
- name: Run pcc checks
|
||||
run: |
|
||||
make pcc_ntt
|
||||
|
||||
- name: Build release
|
||||
run: |
|
||||
make build_ntt
|
||||
|
||||
- name: Build release no-std
|
||||
run: |
|
||||
make build_ntt_no_std
|
||||
71
.github/workflows/cargo_test_fft.yml
vendored
Normal file
71
.github/workflows/cargo_test_fft.yml
vendored
Normal file
@@ -0,0 +1,71 @@
|
||||
# Test tfhe-fft
|
||||
name: Cargo Test tfhe-fft
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
cargo-tests:
|
||||
runs-on: ${{ matrix.runner_type }}
|
||||
strategy:
|
||||
matrix:
|
||||
runner_type: [ubuntu-latest, macos-latest, windows-latest]
|
||||
fail-fast: false
|
||||
steps:
|
||||
- uses: actions/checkout@ac593985615ec2ede58e132d2e21d2b1cbd6127c
|
||||
|
||||
- name: Install Rust
|
||||
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
|
||||
with:
|
||||
toolchain: stable
|
||||
override: true
|
||||
|
||||
- name: Test debug
|
||||
run: |
|
||||
make test_fft
|
||||
|
||||
- name: Test serialization
|
||||
run: make test_fft_serde
|
||||
|
||||
- name: Test no-std
|
||||
run: |
|
||||
make test_fft_no_std
|
||||
|
||||
cargo-tests-nightly:
|
||||
runs-on: ${{ matrix.runner_type }}
|
||||
strategy:
|
||||
matrix:
|
||||
runner_type: [ubuntu-latest, macos-latest, windows-latest]
|
||||
steps:
|
||||
- uses: actions/checkout@ac593985615ec2ede58e132d2e21d2b1cbd6127c
|
||||
|
||||
- name: Install Rust
|
||||
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
|
||||
with:
|
||||
toolchain: nightly
|
||||
override: true
|
||||
|
||||
- name: Test nightly
|
||||
run: |
|
||||
make test_fft_nightly
|
||||
|
||||
- name: Test no-std nightly
|
||||
run: |
|
||||
make test_fft_no_std_nightly
|
||||
|
||||
cargo-tests-node-js:
|
||||
runs-on: "ubuntu-latest"
|
||||
steps:
|
||||
- uses: actions/checkout@ac593985615ec2ede58e132d2e21d2b1cbd6127c
|
||||
|
||||
- name: Test node js
|
||||
run: |
|
||||
make install_node
|
||||
make test_fft_node_js_ci
|
||||
54
.github/workflows/cargo_test_ntt.yml
vendored
Normal file
54
.github/workflows/cargo_test_ntt.yml
vendored
Normal file
@@ -0,0 +1,54 @@
|
||||
# Test tfhe-ntt
name: Cargo Test tfhe-ntt

# Runs on pull request events.
on:
  pull_request:

env:
  CARGO_TERM_COLOR: always

# One run per workflow and PR head branch; a newer run cancels the older one.
concurrency:
  cancel-in-progress: true
  group: "${{ github.workflow }}-${{ github.head_ref }}"
|
||||
|
||||
jobs:
|
||||
# tfhe-ntt test suites (debug, no-std) on the stable toolchain, once per OS
# listed in the matrix.
cargo-tests:
  runs-on: ${{ matrix.os }}
  strategy:
    matrix:
      os: [ubuntu-latest, macos-latest, windows-latest]
    # Let every OS finish even when one of them fails.
    fail-fast: false
  steps:
    # Same pinned checkout commit as the other workflows of this repository.
    - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683

    # actions-rs/toolchain is archived; use the toolchain action (and pin)
    # that the other workflows of this repository already rely on.
    - name: Install Rust
      uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
      with:
        toolchain: stable

    - name: Test debug
      run: make test_ntt

    - name: Test no-std
      run: make test_ntt_no_std
|
||||
|
||||
# tfhe-ntt test suites that need the nightly toolchain, once per OS.
cargo-tests-nightly:
  runs-on: ${{ matrix.os }}
  strategy:
    matrix:
      os: [ubuntu-latest, macos-latest, windows-latest]
  steps:
    # Same pinned checkout commit as the other workflows of this repository.
    - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683

    # actions-rs/toolchain is archived; use the toolchain action (and pin)
    # that the other workflows of this repository already rely on.
    - name: Install Rust
      uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
      with:
        toolchain: nightly

    - name: Test nightly
      run: make test_ntt_nightly

    - name: Test no-std nightly
      run: make test_ntt_no_std_nightly
|
||||
2
.github/workflows/ci_lint.yml
vendored
2
.github/workflows/ci_lint.yml
vendored
@@ -27,7 +27,7 @@ jobs:
|
||||
make lint_workflow
|
||||
|
||||
- name: Ensure SHA pinned actions
|
||||
uses: zgosalvez/github-actions-ensure-sha-pinned-actions@ed00f72a3ca5b6eff8ad4d3ffdcacedb67a21db1 # v3.0.15
|
||||
uses: zgosalvez/github-actions-ensure-sha-pinned-actions@5d6ac37a4cef8b8df67f482a8e384987766f0213 # v3.0.17
|
||||
with:
|
||||
allowlist: |
|
||||
slsa-framework/slsa-github-generator
|
||||
|
||||
16
.github/workflows/code_coverage.yml
vendored
16
.github/workflows/code_coverage.yml
vendored
@@ -25,7 +25,7 @@ jobs:
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
|
||||
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -47,19 +47,19 @@ jobs:
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
- name: Check for file changes
|
||||
id: changed-files
|
||||
uses: tj-actions/changed-files@c3a1bb2c992d77180ae65be6ae6c166cf40f857c
|
||||
uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf
|
||||
with:
|
||||
files_yaml: |
|
||||
tfhe:
|
||||
- tfhe/src/**
|
||||
concrete_csprng:
|
||||
- concrete-csprng/src/**
|
||||
tfhe_csprng:
|
||||
- tfhe-csprng/src/**
|
||||
|
||||
- name: Generate Keys
|
||||
if: steps.changed-files.outputs.tfhe_any_changed == 'true'
|
||||
@@ -83,7 +83,7 @@ jobs:
|
||||
make test_shortint_cov
|
||||
|
||||
- name: Upload tfhe coverage to Codecov
|
||||
uses: codecov/codecov-action@b9fd7d16f6d7d1b5d2bec1a2887e65ceed900238
|
||||
uses: codecov/codecov-action@5c47607acb93fed5485fdbf7232e8a31425f672a
|
||||
if: steps.changed-files.outputs.tfhe_any_changed == 'true'
|
||||
with:
|
||||
token: ${{ secrets.CODECOV_TOKEN }}
|
||||
@@ -97,7 +97,7 @@ jobs:
|
||||
make test_integer_cov
|
||||
|
||||
- name: Upload tfhe coverage to Codecov
|
||||
uses: codecov/codecov-action@b9fd7d16f6d7d1b5d2bec1a2887e65ceed900238
|
||||
uses: codecov/codecov-action@5c47607acb93fed5485fdbf7232e8a31425f672a
|
||||
if: steps.changed-files.outputs.tfhe_any_changed == 'true'
|
||||
with:
|
||||
token: ${{ secrets.CODECOV_TOKEN }}
|
||||
@@ -121,7 +121,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
|
||||
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
@@ -27,7 +27,7 @@ jobs:
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
|
||||
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -51,7 +51,7 @@ jobs:
|
||||
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
@@ -65,7 +65,7 @@ jobs:
|
||||
uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "concrete-csprng randomness check finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
SLACK_MESSAGE: "tfhe-csprng randomness check finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
|
||||
teardown-instance:
|
||||
name: Teardown instance (csprng-randomness-tests)
|
||||
@@ -75,7 +75,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
|
||||
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
2
.github/workflows/gpu_4090_tests.yml
vendored
2
.github/workflows/gpu_4090_tests.yml
vendored
@@ -40,7 +40,7 @@ jobs:
|
||||
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
|
||||
11
.github/workflows/gpu_fast_h100_tests.yml
vendored
11
.github/workflows/gpu_fast_h100_tests.yml
vendored
@@ -34,7 +34,7 @@ jobs:
|
||||
|
||||
- name: Check for file changes
|
||||
id: changed-files
|
||||
uses: tj-actions/changed-files@c3a1bb2c992d77180ae65be6ae6c166cf40f857c
|
||||
uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf
|
||||
with:
|
||||
since_last_remote_commit: true
|
||||
files_yaml: |
|
||||
@@ -44,6 +44,9 @@ jobs:
|
||||
- backends/tfhe-cuda-backend/**
|
||||
- tfhe/src/core_crypto/gpu/**
|
||||
- tfhe/src/integer/gpu/**
|
||||
- tfhe/src/integer/server_key/radix_parallel/tests_unsigned/**
|
||||
- tfhe/src/integer/server_key/radix_parallel/tests_signed/**
|
||||
- tfhe/src/integer/server_key/radix_parallel/tests_cases_unsigned.rs
|
||||
- tfhe/src/shortint/parameters/**
|
||||
- tfhe/src/high_level_api/**
|
||||
- tfhe/src/c_api/**
|
||||
@@ -64,7 +67,7 @@ jobs:
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
|
||||
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -117,7 +120,7 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
@@ -183,7 +186,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
|
||||
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
11
.github/workflows/gpu_fast_tests.yml
vendored
11
.github/workflows/gpu_fast_tests.yml
vendored
@@ -33,7 +33,7 @@ jobs:
|
||||
|
||||
- name: Check for file changes
|
||||
id: changed-files
|
||||
uses: tj-actions/changed-files@c3a1bb2c992d77180ae65be6ae6c166cf40f857c
|
||||
uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf
|
||||
with:
|
||||
since_last_remote_commit: true
|
||||
files_yaml: |
|
||||
@@ -42,6 +42,9 @@ jobs:
|
||||
- tfhe/build.rs
|
||||
- backends/tfhe-cuda-backend/**
|
||||
- tfhe/src/core_crypto/gpu/**
|
||||
- tfhe/src/integer/server_key/radix_parallel/tests_unsigned/**
|
||||
- tfhe/src/integer/server_key/radix_parallel/tests_signed/**
|
||||
- tfhe/src/integer/server_key/radix_parallel/tests_cases_unsigned.rs
|
||||
- tfhe/src/integer/gpu/**
|
||||
- tfhe/src/shortint/parameters/**
|
||||
- tfhe/src/high_level_api/**
|
||||
@@ -62,7 +65,7 @@ jobs:
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
|
||||
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -115,7 +118,7 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
@@ -181,7 +184,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
|
||||
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
6
.github/workflows/gpu_full_h100_tests.yml
vendored
6
.github/workflows/gpu_full_h100_tests.yml
vendored
@@ -25,7 +25,7 @@ jobs:
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
|
||||
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -76,7 +76,7 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
@@ -139,7 +139,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
|
||||
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
11
.github/workflows/gpu_full_multi_gpu_tests.yml
vendored
11
.github/workflows/gpu_full_multi_gpu_tests.yml
vendored
@@ -34,7 +34,7 @@ jobs:
|
||||
|
||||
- name: Check for file changes
|
||||
id: changed-files
|
||||
uses: tj-actions/changed-files@c3a1bb2c992d77180ae65be6ae6c166cf40f857c
|
||||
uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf
|
||||
with:
|
||||
since_last_remote_commit: true
|
||||
files_yaml: |
|
||||
@@ -43,6 +43,9 @@ jobs:
|
||||
- tfhe/build.rs
|
||||
- backends/tfhe-cuda-backend/**
|
||||
- tfhe/src/core_crypto/gpu/**
|
||||
- tfhe/src/integer/server_key/radix_parallel/tests_unsigned/**
|
||||
- tfhe/src/integer/server_key/radix_parallel/tests_signed/**
|
||||
- tfhe/src/integer/server_key/radix_parallel/tests_cases_unsigned.rs
|
||||
- tfhe/src/integer/gpu/**
|
||||
- tfhe/src/shortint/parameters/**
|
||||
- tfhe/src/high_level_api/**
|
||||
@@ -64,7 +67,7 @@ jobs:
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
|
||||
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -117,7 +120,7 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
@@ -186,7 +189,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
|
||||
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
6
.github/workflows/gpu_pcc.yml
vendored
6
.github/workflows/gpu_pcc.yml
vendored
@@ -24,7 +24,7 @@ jobs:
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
|
||||
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -63,7 +63,7 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
@@ -110,7 +110,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
|
||||
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
188
.github/workflows/gpu_signed_integer_classic_tests.yml
vendored
Normal file
188
.github/workflows/gpu_signed_integer_classic_tests.yml
vendored
Normal file
@@ -0,0 +1,188 @@
|
||||
# Signed integer GPU tests on an RTXA6000 VM on hyperstack with classical PBS
name: TFHE Cuda Backend - Signed integer tests with classical PBS

on:
  # Manual trigger from the Actions tab.
  workflow_dispatch:
  # Pull requests only trigger this workflow when a label is added.
  pull_request:
    types:
      - labeled

# Environment shared by every job of this workflow.
env:
  CARGO_TERM_COLOR: always
  # Link to the current run, embedded in the Slack messages below.
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
  RUSTFLAGS: '-C target-cpu=native'
  RUST_BACKTRACE: 'full'
  RUST_MIN_STACK: '8388608'
  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
  # Read by the should-run job to tell pull requests from other triggers.
  IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}
|
||||
|
||||
jobs:
|
||||
# Gate job: reports whether GPU-relevant files changed.
should-run:
  runs-on: ubuntu-latest
  permissions:
    pull-requests: write
  outputs:
    # True outside pull requests; otherwise the `gpu_any_changed` output of
    # the changed-files step.
    gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
  steps:
    - name: Checkout tfhe-rs
      uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
      with:
        token: ${{ secrets.FHE_ACTIONS_TOKEN }}
        fetch-depth: 0

    - name: Check for file changes
      id: changed-files
      uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf
      with:
        since_last_remote_commit: true
        # Path patterns grouped under the `gpu` key.
        files_yaml: |
          gpu:
            - tfhe/Cargo.toml
            - tfhe/build.rs
            - backends/tfhe-cuda-backend/**
            - tfhe/src/core_crypto/gpu/**
            - tfhe/src/integer/server_key/radix_parallel/tests_unsigned/**
            - tfhe/src/integer/server_key/radix_parallel/tests_signed/**
            - tfhe/src/integer/server_key/radix_parallel/tests_cases_unsigned.rs
            - tfhe/src/integer/gpu/**
            - tfhe/src/shortint/parameters/**
            - tfhe/src/high_level_api/**
            - tfhe/src/c_api/**
            - 'tfhe/docs/**.md'
            - '.github/workflows/gpu_signed_integer_classic_tests.yml'
            - scripts/integer-tests.sh
            - ci/slab.toml
|
||||
|
||||
# Starts the remote GPU runner through slab and exposes its label to the
# jobs that follow.
setup-instance:
  name: Setup instance (cuda-signed-classic-tests)
  needs:
    - should-run
  runs-on: ubuntu-latest
  # Always run outside pull requests. For pull requests, require relevant
  # file changes and, on a `labeled` event, the `approved` label.
  if: >-
    github.event_name != 'pull_request' ||
    (github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true') ||
    (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
  outputs:
    runner-name: ${{ steps.start-instance.outputs.label }}
  steps:
    - id: start-instance
      name: Start instance
      uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
      with:
        mode: start
        backend: hyperstack
        profile: gpu-test
        github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
        slab-url: ${{ secrets.SLAB_BASE_URL }}
        job-secret: ${{ secrets.JOB_SECRET }}
|
||||
|
||||
# Builds the toolchain prerequisites on the freshly started runner and runs
# the signed integer GPU test target.
cuda-tests-linux:
  name: CUDA signed integer tests with classical PBS
  needs:
    - should-run
    - setup-instance
  if: >-
    github.event_name != 'pull_request' ||
    (github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
  runs-on: ${{ needs.setup-instance.outputs.runner-name }}
  concurrency:
    group: ${{ github.workflow }}_${{ github.ref }}
    # Runs on main are never cancelled by a newer run.
    cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
  strategy:
    fail-fast: false
    # explicit include-based build matrix, of known valid options
    matrix:
      include:
        - os: ubuntu-22.04
          cuda: "12.2"
          gcc: 11
  env:
    CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
    CMAKE_VERSION: 3.29.6
  steps:
    # Mandatory on hyperstack since a bootable volume is not re-usable yet.
    # CMake is downloaded and built from source at the version pinned above.
    - name: Install dependencies
      run: |
        sudo apt update
        sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
        wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
        tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
        cd cmake-${{ env.CMAKE_VERSION }}
        ./bootstrap
        make -j"$(nproc)"
        sudo make install

    - name: Checkout tfhe-rs
      uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683

    - name: Set up home
      run: |
        echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

    - name: Install latest stable
      uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
      with:
        toolchain: stable

    # Publish the CUDA locations to the environment and PATH of later steps.
    - name: Export CUDA variables
      if: ${{ !cancelled() }}
      run: |
        echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
        echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
        echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
        echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"

    # Specify the correct host compilers
    - name: Export gcc and g++ variables
      if: ${{ !cancelled() }}
      run: |
        {
          echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
          echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
          echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
          echo "HOME=/home/ubuntu";
        } >> "${GITHUB_ENV}"

    - name: Check device is detected
      if: ${{ !cancelled() }}
      run: nvidia-smi

    - name: Run signed integer tests
      run: |
        BIG_TESTS_INSTANCE=TRUE make test_signed_integer_gpu_ci
|
||||
|
||||
# Posts the test job result to Slack when a needed job failed and the test
# job was not skipped.
slack-notify:
  name: Slack Notification
  runs-on: ubuntu-latest
  needs:
    - setup-instance
    - cuda-tests-linux
  if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' && failure() }}
  # A failed notification does not fail the workflow run.
  continue-on-error: true
  steps:
    - name: Send message
      uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
      env:
        SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}
        SLACK_MESSAGE: "Integer GPU signed integer tests with classical PBS finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.ACTION_RUN_URL }})"
|
||||
|
||||
# Stops the remote runner whenever setup-instance was not skipped, whatever
# the outcome of the test job.
teardown-instance:
  name: Teardown instance (cuda-signed-classic-tests)
  runs-on: ubuntu-latest
  needs:
    - setup-instance
    - cuda-tests-linux
  if: ${{ always() && needs.setup-instance.result != 'skipped' }}
  steps:
    - id: stop-instance
      name: Stop instance
      uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
      with:
        mode: stop
        label: ${{ needs.setup-instance.outputs.runner-name }}
        github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
        slab-url: ${{ secrets.SLAB_BASE_URL }}
        job-secret: ${{ secrets.JOB_SECRET }}

    # Only reached with a failure status, i.e. when stopping the instance failed.
    - name: Slack Notification
      if: ${{ failure() }}
      uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
      continue-on-error: true
      env:
        SLACK_COLOR: ${{ job.status }}
        SLACK_MESSAGE: "Instance teardown (cuda-signed-classic-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
@@ -35,7 +35,7 @@ jobs:
|
||||
|
||||
- name: Check for file changes
|
||||
id: changed-files
|
||||
uses: tj-actions/changed-files@c3a1bb2c992d77180ae65be6ae6c166cf40f857c
|
||||
uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf
|
||||
with:
|
||||
since_last_remote_commit: true
|
||||
files_yaml: |
|
||||
@@ -44,6 +44,9 @@ jobs:
|
||||
- tfhe/build.rs
|
||||
- backends/tfhe-cuda-backend/**
|
||||
- tfhe/src/core_crypto/gpu/**
|
||||
- tfhe/src/integer/server_key/radix_parallel/tests_unsigned/**
|
||||
- tfhe/src/integer/server_key/radix_parallel/tests_signed/**
|
||||
- tfhe/src/integer/server_key/radix_parallel/tests_cases_unsigned.rs
|
||||
- tfhe/src/integer/gpu/**
|
||||
- tfhe/src/shortint/parameters/**
|
||||
- tfhe/src/high_level_api/**
|
||||
@@ -65,7 +68,7 @@ jobs:
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
|
||||
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -116,7 +119,7 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
@@ -143,10 +146,6 @@ jobs:
|
||||
if: ${{ !cancelled() }}
|
||||
run: nvidia-smi
|
||||
|
||||
- name: Run signed integer tests
|
||||
run: |
|
||||
BIG_TESTS_INSTANCE=TRUE make test_signed_integer_gpu_ci
|
||||
|
||||
- name: Run signed integer multi-bit tests
|
||||
run: |
|
||||
BIG_TESTS_INSTANCE=TRUE make test_signed_integer_multi_bit_gpu_ci
|
||||
@@ -172,7 +171,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
|
||||
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
11
.github/workflows/gpu_signed_integer_tests.yml
vendored
11
.github/workflows/gpu_signed_integer_tests.yml
vendored
@@ -41,7 +41,7 @@ jobs:
|
||||
|
||||
- name: Check for file changes
|
||||
id: changed-files
|
||||
uses: tj-actions/changed-files@c3a1bb2c992d77180ae65be6ae6c166cf40f857c
|
||||
uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf
|
||||
with:
|
||||
since_last_remote_commit: true
|
||||
files_yaml: |
|
||||
@@ -50,6 +50,9 @@ jobs:
|
||||
- tfhe/build.rs
|
||||
- backends/tfhe-cuda-backend/**
|
||||
- tfhe/src/core_crypto/gpu/**
|
||||
- tfhe/src/integer/server_key/radix_parallel/tests_unsigned/**
|
||||
- tfhe/src/integer/server_key/radix_parallel/tests_signed/**
|
||||
- tfhe/src/integer/server_key/radix_parallel/tests_cases_unsigned.rs
|
||||
- tfhe/src/integer/gpu/**
|
||||
- tfhe/src/shortint/parameters/**
|
||||
- tfhe/src/high_level_api/**
|
||||
@@ -71,7 +74,7 @@ jobs:
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
|
||||
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -125,7 +128,7 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
@@ -185,7 +188,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
|
||||
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
188
.github/workflows/gpu_unsigned_integer_classic_tests.yml
vendored
Normal file
188
.github/workflows/gpu_unsigned_integer_classic_tests.yml
vendored
Normal file
@@ -0,0 +1,188 @@
|
||||
# Test unsigned integers on an RTXA6000 VM on hyperstack with the classical PBS
name: TFHE Cuda Backend - Unsigned integer tests with classical PBS

on:
  # Manual trigger from the Actions tab.
  workflow_dispatch:
  # Pull requests only trigger this workflow when a label is added.
  pull_request:
    types:
      - labeled

# Environment shared by every job of this workflow.
env:
  CARGO_TERM_COLOR: always
  # Link to the current run, embedded in the Slack messages below.
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
  RUSTFLAGS: '-C target-cpu=native'
  RUST_BACKTRACE: 'full'
  RUST_MIN_STACK: '8388608'
  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
  # Read by the should-run job to tell pull requests from other triggers.
  IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}
|
||||
|
||||
jobs:
|
||||
# Gate job: reports whether GPU-relevant files changed.
should-run:
  runs-on: ubuntu-latest
  permissions:
    pull-requests: write
  outputs:
    # True outside pull requests; otherwise the `gpu_any_changed` output of
    # the changed-files step.
    gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
  steps:
    - name: Checkout tfhe-rs
      uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
      with:
        token: ${{ secrets.FHE_ACTIONS_TOKEN }}
        fetch-depth: 0

    - name: Check for file changes
      id: changed-files
      uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf
      with:
        since_last_remote_commit: true
        # Path patterns grouped under the `gpu` key.
        files_yaml: |
          gpu:
            - tfhe/Cargo.toml
            - tfhe/build.rs
            - backends/tfhe-cuda-backend/**
            - tfhe/src/core_crypto/gpu/**
            - tfhe/src/integer/server_key/radix_parallel/tests_unsigned/**
            - tfhe/src/integer/server_key/radix_parallel/tests_signed/**
            - tfhe/src/integer/server_key/radix_parallel/tests_cases_unsigned.rs
            - tfhe/src/integer/gpu/**
            - tfhe/src/shortint/parameters/**
            - tfhe/src/high_level_api/**
            - tfhe/src/c_api/**
            - 'tfhe/docs/**.md'
            - '.github/workflows/gpu_unsigned_integer_classic_tests.yml'
            - scripts/integer-tests.sh
            - ci/slab.toml
|
||||
|
||||
# Starts the remote GPU runner through slab and exposes its label to the
# jobs that follow.
setup-instance:
  name: Setup instance (cuda-unsigned-classic-tests)
  needs:
    - should-run
  runs-on: ubuntu-latest
  # Always run outside pull requests. For pull requests, require relevant
  # file changes and, on a `labeled` event, the `approved` label.
  if: >-
    github.event_name != 'pull_request' ||
    (github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true') ||
    (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
  outputs:
    runner-name: ${{ steps.start-instance.outputs.label }}
  steps:
    - id: start-instance
      name: Start instance
      uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
      with:
        mode: start
        backend: hyperstack
        profile: gpu-test
        github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
        slab-url: ${{ secrets.SLAB_BASE_URL }}
        job-secret: ${{ secrets.JOB_SECRET }}
|
||||
|
||||
# Builds the toolchain prerequisites on the freshly started runner and runs
# the unsigned integer GPU test target.
cuda-tests-linux:
  name: CUDA unsigned integer tests with classical PBS
  needs:
    - should-run
    - setup-instance
  if: >-
    github.event_name != 'pull_request' ||
    (github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
  runs-on: ${{ needs.setup-instance.outputs.runner-name }}
  concurrency:
    group: ${{ github.workflow }}_${{ github.ref }}
    # Runs on main are never cancelled by a newer run.
    cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
  strategy:
    fail-fast: false
    # explicit include-based build matrix, of known valid options
    matrix:
      include:
        - os: ubuntu-22.04
          cuda: "12.2"
          gcc: 11
  env:
    CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
    CMAKE_VERSION: 3.29.6
  steps:
    # Mandatory on hyperstack since a bootable volume is not re-usable yet.
    # CMake is downloaded and built from source at the version pinned above.
    - name: Install dependencies
      run: |
        sudo apt update
        sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
        wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
        tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
        cd cmake-${{ env.CMAKE_VERSION }}
        ./bootstrap
        make -j"$(nproc)"
        sudo make install

    - name: Checkout tfhe-rs
      uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683

    - name: Set up home
      run: |
        echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

    - name: Install latest stable
      uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
      with:
        toolchain: stable

    # Publish the CUDA locations to the environment and PATH of later steps.
    - name: Export CUDA variables
      if: ${{ !cancelled() }}
      run: |
        echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
        echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
        echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
        echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"

    # Specify the correct host compilers
    - name: Export gcc and g++ variables
      if: ${{ !cancelled() }}
      run: |
        {
          echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
          echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
          echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
          echo "HOME=/home/ubuntu";
        } >> "${GITHUB_ENV}"

    - name: Check device is detected
      if: ${{ !cancelled() }}
      run: nvidia-smi

    - name: Run unsigned integer tests
      run: |
        BIG_TESTS_INSTANCE=TRUE make test_unsigned_integer_gpu_ci
|
||||
|
||||
# Posts the test job result to Slack when a needed job failed and the test
# job was not skipped.
slack-notify:
  name: Slack Notification
  runs-on: ubuntu-latest
  needs:
    - setup-instance
    - cuda-tests-linux
  if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' && failure() }}
  # A failed notification does not fail the workflow run.
  continue-on-error: true
  steps:
    - name: Send message
      uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
      env:
        SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}
        SLACK_MESSAGE: "Unsigned integer GPU classic tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.ACTION_RUN_URL }})"
|
||||
|
||||
# Stops the remote runner whenever setup-instance was not skipped, whatever
# the outcome of the test job.
teardown-instance:
  name: Teardown instance (cuda-unsigned-classic-tests)
  runs-on: ubuntu-latest
  needs:
    - setup-instance
    - cuda-tests-linux
  if: ${{ always() && needs.setup-instance.result != 'skipped' }}
  steps:
    - id: stop-instance
      name: Stop instance
      uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
      with:
        mode: stop
        label: ${{ needs.setup-instance.outputs.runner-name }}
        github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
        slab-url: ${{ secrets.SLAB_BASE_URL }}
        job-secret: ${{ secrets.JOB_SECRET }}

    # Only reached with a failure status, i.e. when stopping the instance failed.
    - name: Slack Notification
      if: ${{ failure() }}
      uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
      continue-on-error: true
      env:
        SLACK_COLOR: ${{ job.status }}
        SLACK_MESSAGE: "Instance teardown (cuda-unsigned-classic-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
@@ -35,7 +35,7 @@ jobs:
|
||||
|
||||
- name: Check for file changes
|
||||
id: changed-files
|
||||
uses: tj-actions/changed-files@c3a1bb2c992d77180ae65be6ae6c166cf40f857c
|
||||
uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf
|
||||
with:
|
||||
since_last_remote_commit: true
|
||||
files_yaml: |
|
||||
@@ -44,12 +44,15 @@ jobs:
|
||||
- tfhe/build.rs
|
||||
- backends/tfhe-cuda-backend/**
|
||||
- tfhe/src/core_crypto/gpu/**
|
||||
- tfhe/src/integer/server_key/radix_parallel/tests_unsigned/**
|
||||
- tfhe/src/integer/server_key/radix_parallel/tests_signed/**
|
||||
- tfhe/src/integer/server_key/radix_parallel/tests_cases_unsigned.rs
|
||||
- tfhe/src/integer/gpu/**
|
||||
- tfhe/src/shortint/parameters/**
|
||||
- tfhe/src/high_level_api/**
|
||||
- tfhe/src/c_api/**
|
||||
- 'tfhe/docs/**.md'
|
||||
- '.github/workflows/gpu_unsigned_integer_tests.yml'
|
||||
- '.github/workflows/gpu_unsigned_integer_h100_tests.yml'
|
||||
- scripts/integer-tests.sh
|
||||
- ci/slab.toml
|
||||
|
||||
@@ -65,7 +68,7 @@ jobs:
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
|
||||
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -116,7 +119,7 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
@@ -143,10 +146,6 @@ jobs:
|
||||
if: ${{ !cancelled() }}
|
||||
run: nvidia-smi
|
||||
|
||||
- name: Run unsigned integer tests
|
||||
run: |
|
||||
BIG_TESTS_INSTANCE=TRUE make test_unsigned_integer_gpu_ci
|
||||
|
||||
- name: Run unsigned integer multi-bit tests
|
||||
run: |
|
||||
BIG_TESTS_INSTANCE=TRUE make test_unsigned_integer_multi_bit_gpu_ci
|
||||
@@ -172,7 +171,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
|
||||
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
11
.github/workflows/gpu_unsigned_integer_tests.yml
vendored
11
.github/workflows/gpu_unsigned_integer_tests.yml
vendored
@@ -41,7 +41,7 @@ jobs:
|
||||
|
||||
- name: Check for file changes
|
||||
id: changed-files
|
||||
uses: tj-actions/changed-files@c3a1bb2c992d77180ae65be6ae6c166cf40f857c
|
||||
uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf
|
||||
with:
|
||||
since_last_remote_commit: true
|
||||
files_yaml: |
|
||||
@@ -50,6 +50,9 @@ jobs:
|
||||
- tfhe/build.rs
|
||||
- backends/tfhe-cuda-backend/**
|
||||
- tfhe/src/core_crypto/gpu/**
|
||||
- tfhe/src/integer/server_key/radix_parallel/tests_unsigned/**
|
||||
- tfhe/src/integer/server_key/radix_parallel/tests_signed/**
|
||||
- tfhe/src/integer/server_key/radix_parallel/tests_cases_unsigned.rs
|
||||
- tfhe/src/integer/gpu/**
|
||||
- tfhe/src/shortint/parameters/**
|
||||
- tfhe/src/high_level_api/**
|
||||
@@ -71,7 +74,7 @@ jobs:
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
|
||||
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -122,7 +125,7 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
@@ -182,7 +185,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
|
||||
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
61
.github/workflows/m1_tests.yml
vendored
61
.github/workflows/m1_tests.yml
vendored
@@ -3,7 +3,7 @@ name: Tests on M1 CPU
|
||||
on:
|
||||
workflow_dispatch:
|
||||
pull_request:
|
||||
types: [ labeled ]
|
||||
types: [labeled]
|
||||
# Have a nightly build for M1 tests
|
||||
schedule:
|
||||
# * is a special character in YAML so you have to quote this string
|
||||
@@ -27,7 +27,7 @@ concurrency:
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
cargo-builds:
|
||||
cargo-builds-m1:
|
||||
if: ${{ (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') || github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'm1_test') }}
|
||||
runs-on: ["self-hosted", "m1mac"]
|
||||
# 12 hours, default is 6 hours, hopefully this is more than enough
|
||||
@@ -36,20 +36,57 @@ jobs:
|
||||
steps:
|
||||
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
with:
|
||||
persist-credentials: 'false'
|
||||
persist-credentials: "false"
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
- name: Run pcc FFT checks
|
||||
run: |
|
||||
make pcc_fft
|
||||
|
||||
- name: Build FFT release
|
||||
run: |
|
||||
make build_fft
|
||||
|
||||
- name: Build FFT release no-std
|
||||
run: |
|
||||
make build_fft_no_std
|
||||
|
||||
- name: Run FFT tests
|
||||
run: |
|
||||
make test_fft
|
||||
make test_fft_serde
|
||||
make test_fft_nightly
|
||||
make test_fft_no_std
|
||||
make test_fft_no_std_nightly
|
||||
# we don't run the js stuff here as it's causing issues with the M1 config
|
||||
|
||||
- name: Run pcc NTT checks
|
||||
run: |
|
||||
make pcc_ntt
|
||||
|
||||
- name: Build NTT release
|
||||
run: |
|
||||
make build_ntt
|
||||
|
||||
- name: Build NTT release no-std
|
||||
run: |
|
||||
make build_ntt_no_std
|
||||
|
||||
- name: Run NTT tests
|
||||
run: |
|
||||
make test_ntt_all
|
||||
|
||||
- name: Run pcc checks
|
||||
run: |
|
||||
make pcc
|
||||
|
||||
- name: Build concrete-csprng
|
||||
- name: Build tfhe-csprng
|
||||
run: |
|
||||
make build_concrete_csprng
|
||||
make build_tfhe_csprng
|
||||
|
||||
- name: Build Release core
|
||||
run: |
|
||||
@@ -75,9 +112,9 @@ jobs:
|
||||
run: |
|
||||
make build_c_api
|
||||
|
||||
- name: Run concrete-csprng tests
|
||||
- name: Run tfhe-csprng tests
|
||||
run: |
|
||||
make test_concrete_csprng
|
||||
make test_tfhe_csprng
|
||||
|
||||
- name: Run tfhe-zk-pok tests
|
||||
run: |
|
||||
@@ -137,7 +174,7 @@ jobs:
|
||||
name: Remove m1_test label
|
||||
runs-on: ubuntu-latest
|
||||
needs:
|
||||
- cargo-builds
|
||||
- cargo-builds-m1
|
||||
if: ${{ always() }}
|
||||
steps:
|
||||
- uses: actions-ecosystem/action-remove-labels@2ce5d41b4b6aa8503e285553f75ed56e0a40bae0
|
||||
@@ -147,13 +184,13 @@ jobs:
|
||||
github_token: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ needs.cargo-builds.result != 'skipped' }}
|
||||
if: ${{ needs.cargo-builds-m1.result != 'skipped' }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
|
||||
env:
|
||||
SLACK_COLOR: ${{ needs.cargo-builds.result }}
|
||||
SLACK_COLOR: ${{ needs.cargo-builds-m1.result }}
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_MESSAGE: "M1 tests finished with status: ${{ needs.cargo-builds.result }}. (${{ env.ACTION_RUN_URL }})"
|
||||
SLACK_MESSAGE: "M1 tests finished with status: ${{ needs.cargo-builds-m1.result }}. (${{ env.ACTION_RUN_URL }})"
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
name: Publish concrete-csprng release
|
||||
name: Publish tfhe-csprng release
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
@@ -19,7 +19,7 @@ jobs:
|
||||
READ_ORG_TOKEN: ${{ secrets.READ_ORG_TOKEN }}
|
||||
|
||||
publish_release:
|
||||
name: Publish concrete-csprng Release
|
||||
name: Publish tfhe-csprng Release
|
||||
needs: verify_tag
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
@@ -33,7 +33,7 @@ jobs:
|
||||
CRATES_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
|
||||
DRY_RUN: ${{ inputs.dry_run && '--dry-run' || '' }}
|
||||
run: |
|
||||
cargo publish -p concrete-csprng --token ${{ env.CRATES_TOKEN }} ${{ env.DRY_RUN }}
|
||||
cargo publish -p tfhe-csprng --token ${{ env.CRATES_TOKEN }} ${{ env.DRY_RUN }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() }}
|
||||
@@ -43,6 +43,6 @@ jobs:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_MESSAGE: "concrete-csprng release finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
SLACK_MESSAGE: "tfhe-csprng release finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
|
||||
6
.github/workflows/make_release_cuda.yml
vendored
6
.github/workflows/make_release_cuda.yml
vendored
@@ -36,7 +36,7 @@ jobs:
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
|
||||
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -70,7 +70,7 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
@@ -119,7 +119,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
|
||||
uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
49
.github/workflows/make_release_tfhe_fft.yml
vendored
Normal file
49
.github/workflows/make_release_tfhe_fft.yml
vendored
Normal file
@@ -0,0 +1,49 @@
|
||||
# Publish new release of tfhe-fft
|
||||
name: Publish tfhe-fft release
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
dry_run:
|
||||
description: "Dry-run"
|
||||
type: boolean
|
||||
default: true
|
||||
|
||||
env:
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
|
||||
jobs:
|
||||
verify_tag:
|
||||
uses: ./.github/workflows/verify_tagged_commit.yml
|
||||
secrets:
|
||||
RELEASE_TEAM: ${{ secrets.RELEASE_TEAM }}
|
||||
READ_ORG_TOKEN: ${{ secrets.READ_ORG_TOKEN }}
|
||||
|
||||
publish_release:
|
||||
name: Publish tfhe-fft Release
|
||||
runs-on: ubuntu-latest
|
||||
needs: verify_tag
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Publish crate.io package
|
||||
env:
|
||||
CRATES_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
|
||||
DRY_RUN: ${{ inputs.dry_run && '--dry-run' || '' }}
|
||||
run: |
|
||||
cargo publish -p tfhe-fft --token ${{ env.CRATES_TOKEN }} ${{ env.DRY_RUN }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_MESSAGE: "tfhe-fft release failed: (${{ env.ACTION_RUN_URL }})"
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
49
.github/workflows/make_release_tfhe_ntt.yml
vendored
Normal file
49
.github/workflows/make_release_tfhe_ntt.yml
vendored
Normal file
@@ -0,0 +1,49 @@
|
||||
# Publish new release of tfhe-ntt
|
||||
name: Publish tfhe-ntt release
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
dry_run:
|
||||
description: "Dry-run"
|
||||
type: boolean
|
||||
default: true
|
||||
|
||||
env:
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
|
||||
jobs:
|
||||
verify_tag:
|
||||
uses: ./.github/workflows/verify_tagged_commit.yml
|
||||
secrets:
|
||||
RELEASE_TEAM: ${{ secrets.RELEASE_TEAM }}
|
||||
READ_ORG_TOKEN: ${{ secrets.READ_ORG_TOKEN }}
|
||||
|
||||
publish_release:
|
||||
name: Publish tfhe-ntt Release
|
||||
runs-on: ubuntu-latest
|
||||
needs: verify_tag
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Publish crate.io package
|
||||
env:
|
||||
CRATES_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
|
||||
DRY_RUN: ${{ inputs.dry_run && '--dry-run' || '' }}
|
||||
run: |
|
||||
cargo publish -p tfhe-ntt --token ${{ env.CRATES_TOKEN }} ${{ env.DRY_RUN }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_MESSAGE: "tfhe-ntt release failed: (${{ env.ACTION_RUN_URL }})"
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
3
.gitignore
vendored
3
.gitignore
vendored
@@ -13,6 +13,7 @@ target/
|
||||
|
||||
# Some of our bench outputs
|
||||
/tfhe/benchmarks_parameters
|
||||
/tfhe-zk-pok/benchmarks_parameters
|
||||
**/*.csv
|
||||
|
||||
# dieharder run log
|
||||
@@ -28,6 +29,8 @@ backends/tfhe-cuda-backend/cuda/cmake-build-debug/
|
||||
tfhe/web_wasm_parallel_tests/server.PID
|
||||
venv/
|
||||
web-test-runner/
|
||||
node_modules/
|
||||
package-lock.json
|
||||
|
||||
# Dir used for backward compatibility test data
|
||||
tfhe/tfhe-backward-compat-data/
|
||||
|
||||
12
Cargo.toml
12
Cargo.toml
@@ -2,10 +2,12 @@
|
||||
resolver = "2"
|
||||
members = [
|
||||
"tfhe",
|
||||
"tfhe-fft",
|
||||
"tfhe-ntt",
|
||||
"tfhe-zk-pok",
|
||||
"tasks",
|
||||
"apps/trivium",
|
||||
"concrete-csprng",
|
||||
"tfhe-csprng",
|
||||
"backends/tfhe-cuda-backend",
|
||||
"utils/tfhe-versionable",
|
||||
"utils/tfhe-versionable-derive",
|
||||
@@ -16,6 +18,14 @@ exclude = [
|
||||
"utils/cargo-tfhe-lints-inner",
|
||||
"utils/cargo-tfhe-lints"
|
||||
]
|
||||
[workspace.dependencies]
|
||||
aligned-vec = { version = "0.5", default-features = false }
|
||||
bytemuck = "1.14.3"
|
||||
dyn-stack = { version = "0.10", default-features = false }
|
||||
num-complex = "0.4"
|
||||
pulp = { version = "0.18.22", default-features = false }
|
||||
serde = { version = "1.0", default-features = false }
|
||||
wasm-bindgen = ">=0.2.86,<0.2.94"
|
||||
|
||||
[profile.bench]
|
||||
lto = "fat"
|
||||
|
||||
319
Makefile
319
Makefile
@@ -18,13 +18,15 @@ FAST_TESTS?=FALSE
|
||||
FAST_BENCH?=FALSE
|
||||
NIGHTLY_TESTS?=FALSE
|
||||
BENCH_OP_FLAVOR?=DEFAULT
|
||||
BENCH_TYPE?=latency
|
||||
NODE_VERSION=22.6
|
||||
FORWARD_COMPAT?=OFF
|
||||
BACKWARD_COMPAT_DATA_URL=https://github.com/zama-ai/tfhe-backward-compat-data.git
|
||||
BACKWARD_COMPAT_DATA_BRANCH?=v0.3
|
||||
BACKWARD_COMPAT_DATA_BRANCH?=v0.4
|
||||
BACKWARD_COMPAT_DATA_PROJECT=tfhe-backward-compat-data
|
||||
BACKWARD_COMPAT_DATA_DIR=$(BACKWARD_COMPAT_DATA_PROJECT)
|
||||
TFHE_SPEC:=tfhe
|
||||
# We are kind of hacking the cut here, the version cannot contain a quote '"'
|
||||
WASM_BINDGEN_VERSION:=$(shell grep '^wasm-bindgen[[:space:]]*=' Cargo.toml | cut -d '"' -f 2 | xargs)
|
||||
WEB_RUNNER_DIR=web-test-runner
|
||||
WEB_SERVER_DIR=tfhe/web_wasm_parallel_tests
|
||||
# This is done to avoid forgetting it, we still precise the RUSTFLAGS in the commands to be able to
|
||||
@@ -43,12 +45,6 @@ else
|
||||
COVERAGE_ONLY=
|
||||
endif
|
||||
|
||||
ifeq ($(FORWARD_COMPAT),ON)
|
||||
FORWARD_COMPAT_FEATURE=forward_compatibility
|
||||
else
|
||||
FORWARD_COMPAT_FEATURE=
|
||||
endif
|
||||
|
||||
# Variables used only for regex_engine example
|
||||
REGEX_STRING?=''
|
||||
REGEX_PATTERN?=''
|
||||
@@ -98,12 +94,26 @@ install_rs_build_toolchain:
|
||||
( echo "Unable to install $(RS_BUILD_TOOLCHAIN) toolchain, check your rustup installation. \
|
||||
Rustup can be downloaded at https://rustup.rs/" && exit 1 )
|
||||
|
||||
.PHONY: install_build_wasm32_target # Install the wasm32 toolchain used for builds
|
||||
install_build_wasm32_target: install_rs_build_toolchain
|
||||
rustup +$(RS_BUILD_TOOLCHAIN) target add wasm32-unknown-unknown || \
|
||||
( echo "Unable to install wasm32-unknown-unknown target toolchain, check your rustup installation. \
|
||||
Rustup can be downloaded at https://rustup.rs/" && exit 1 )
|
||||
|
||||
.PHONY: install_cargo_nextest # Install cargo nextest used for shortint tests
|
||||
install_cargo_nextest: install_rs_build_toolchain
|
||||
@cargo nextest --version > /dev/null 2>&1 || \
|
||||
cargo $(CARGO_RS_BUILD_TOOLCHAIN) install cargo-nextest --locked || \
|
||||
( echo "Unable to install cargo nextest, unknown error." && exit 1 )
|
||||
|
||||
# The installation should use the ^ symbol if the specified version in the root Cargo.toml is of the
|
||||
# form "0.2.96" then we get ^0.2.96 e.g., as we don't lock those dependencies
|
||||
# this allows to get the matching CLI
|
||||
# If a version range is specified no need to add the leading ^
|
||||
.PHONY: install_wasm_bindgen_cli # Install wasm-bindgen-cli to get access to the test runner
|
||||
install_wasm_bindgen_cli: install_rs_build_toolchain
|
||||
cargo +$(RS_BUILD_TOOLCHAIN) install --locked wasm-bindgen-cli --version "$(WASM_BINDGEN_VERSION)"
|
||||
|
||||
.PHONY: install_wasm_pack # Install wasm-pack to build JS packages
|
||||
install_wasm_pack: install_rs_build_toolchain
|
||||
@wasm-pack --version > /dev/null 2>&1 || \
|
||||
@@ -308,6 +318,9 @@ clippy_core: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
|
||||
--features=$(TARGET_ARCH_FEATURE),experimental,nightly-avx512 \
|
||||
-p $(TFHE_SPEC) -- --no-deps -D warnings
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
|
||||
--features=$(TARGET_ARCH_FEATURE),zk-pok \
|
||||
-p $(TFHE_SPEC) -- --no-deps -D warnings
|
||||
|
||||
.PHONY: clippy_boolean # Run clippy lints enabling the boolean features
|
||||
clippy_boolean: install_rs_check_toolchain
|
||||
@@ -323,6 +336,9 @@ clippy_shortint: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
|
||||
--features=$(TARGET_ARCH_FEATURE),shortint,experimental \
|
||||
-p $(TFHE_SPEC) -- --no-deps -D warnings
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
|
||||
--features=$(TARGET_ARCH_FEATURE),zk-pok,shortint \
|
||||
-p $(TFHE_SPEC) -- --no-deps -D warnings
|
||||
|
||||
.PHONY: clippy_integer # Run clippy lints enabling the integer features
|
||||
clippy_integer: install_rs_check_toolchain
|
||||
@@ -347,7 +363,7 @@ clippy_rustdoc: install_rs_check_toolchain
|
||||
fi && \
|
||||
CLIPPYFLAGS="-D warnings" RUSTDOCFLAGS="--no-run --nocapture --test-builder ./scripts/clippy_driver.sh -Z unstable-options" \
|
||||
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" test --doc \
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,zk-pok,pbs-stats \
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,zk-pok,pbs-stats,strings \
|
||||
-p $(TFHE_SPEC)
|
||||
|
||||
.PHONY: clippy_c_api # Run clippy lints enabling the boolean, shortint and the C API
|
||||
@@ -378,17 +394,17 @@ clippy_trivium: install_rs_check_toolchain
|
||||
.PHONY: clippy_all_targets # Run clippy lints on all targets (benches, examples, etc.)
|
||||
clippy_all_targets: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,zk-pok \
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,zk-pok,strings \
|
||||
-p $(TFHE_SPEC) -- --no-deps -D warnings
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,zk-pok,experimental \
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,zk-pok,strings,experimental \
|
||||
-p $(TFHE_SPEC) -- --no-deps -D warnings
|
||||
|
||||
.PHONY: clippy_concrete_csprng # Run clippy lints on concrete-csprng
|
||||
clippy_concrete_csprng: install_rs_check_toolchain
|
||||
.PHONY: clippy_tfhe_csprng # Run clippy lints on tfhe-csprng
|
||||
clippy_tfhe_csprng: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
|
||||
--features=$(TARGET_ARCH_FEATURE) \
|
||||
-p concrete-csprng -- --no-deps -D warnings
|
||||
-p tfhe-csprng -- --no-deps -D warnings
|
||||
|
||||
.PHONY: clippy_zk_pok # Run clippy lints on tfhe-zk-pok
|
||||
clippy_zk_pok: install_rs_check_toolchain
|
||||
@@ -404,12 +420,12 @@ clippy_versionable: install_rs_check_toolchain
|
||||
|
||||
.PHONY: clippy_all # Run all clippy targets
|
||||
clippy_all: clippy_rustdoc clippy clippy_boolean clippy_shortint clippy_integer clippy_all_targets \
|
||||
clippy_c_api clippy_js_wasm_api clippy_tasks clippy_core clippy_concrete_csprng clippy_zk_pok clippy_trivium \
|
||||
clippy_c_api clippy_js_wasm_api clippy_tasks clippy_core clippy_tfhe_csprng clippy_zk_pok clippy_trivium \
|
||||
clippy_versionable
|
||||
|
||||
.PHONY: clippy_fast # Run main clippy targets
|
||||
clippy_fast: clippy_rustdoc clippy clippy_all_targets clippy_c_api clippy_js_wasm_api clippy_tasks \
|
||||
clippy_core clippy_concrete_csprng
|
||||
clippy_core clippy_tfhe_csprng
|
||||
|
||||
.PHONY: clippy_cuda_backend # Run clippy lints on the tfhe-cuda-backend
|
||||
clippy_cuda_backend: install_rs_check_toolchain
|
||||
@@ -475,7 +491,7 @@ build_tfhe_coverage: install_rs_build_toolchain
|
||||
.PHONY: build_c_api # Build the C API for boolean, shortint and integer
|
||||
build_c_api: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api,zk-pok,$(FORWARD_COMPAT_FEATURE) \
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api,zk-pok \
|
||||
-p $(TFHE_SPEC)
|
||||
|
||||
.PHONY: build_c_api_gpu # Build the C API for boolean, shortint and integer
|
||||
@@ -487,7 +503,7 @@ build_c_api_gpu: install_rs_check_toolchain
|
||||
.PHONY: build_c_api_experimental_deterministic_fft # Build the C API for boolean, shortint and integer with experimental deterministic FFT
|
||||
build_c_api_experimental_deterministic_fft: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api,zk-pok,experimental-force_fft_algo_dif4,$(FORWARD_COMPAT_FEATURE) \
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api,zk-pok,experimental-force_fft_algo_dif4 \
|
||||
-p $(TFHE_SPEC)
|
||||
|
||||
.PHONY: build_web_js_api # Build the js API targeting the web browser
|
||||
@@ -515,10 +531,10 @@ build_node_js_api: install_rs_build_toolchain install_wasm_pack
|
||||
wasm-pack build --release --target=nodejs \
|
||||
-- --features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,zk-pok
|
||||
|
||||
.PHONY: build_concrete_csprng # Build concrete_csprng
|
||||
build_concrete_csprng: install_rs_build_toolchain
|
||||
.PHONY: build_tfhe_csprng # Build tfhe_csprng
|
||||
build_tfhe_csprng: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
|
||||
--features=$(TARGET_ARCH_FEATURE) -p concrete-csprng --all-targets
|
||||
--features=$(TARGET_ARCH_FEATURE) -p tfhe-csprng --all-targets
|
||||
|
||||
.PHONY: test_core_crypto # Run the tests of the core_crypto module including experimental ones
|
||||
test_core_crypto: install_rs_build_toolchain install_rs_check_toolchain
|
||||
@@ -749,10 +765,15 @@ test_signed_integer_multi_bit_ci: install_rs_check_toolchain install_cargo_nexte
|
||||
--cargo-profile "$(CARGO_PROFILE)" --multi-bit --avx512-support "$(AVX512_SUPPORT)" \
|
||||
--signed-only --tfhe-package "$(TFHE_SPEC)"
|
||||
|
||||
.PHONY: test_safe_deserialization # Run the tests for safe deserialization
|
||||
test_safe_deserialization: install_rs_build_toolchain install_cargo_nextest
|
||||
.PHONY: test_safe_serialization # Run the tests for safe serialization
|
||||
test_safe_serialization: install_rs_build_toolchain install_cargo_nextest
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache -p $(TFHE_SPEC) -- safe_deserialization::
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache -p $(TFHE_SPEC) -- safe_serialization::
|
||||
|
||||
.PHONY: test_zk # Run the tests for the zk module of the TFHE-rs crate
|
||||
test_zk: install_rs_build_toolchain install_cargo_nextest
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
|
||||
--features=$(TARGET_ARCH_FEATURE),shortint,zk-pok -p $(TFHE_SPEC) -- zk::
|
||||
|
||||
.PHONY: test_integer # Run all the tests for integer
|
||||
test_integer: install_rs_build_toolchain
|
||||
@@ -779,6 +800,13 @@ test_high_level_api_gpu: install_rs_build_toolchain install_cargo_nextest
|
||||
--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,gpu -p $(TFHE_SPEC) \
|
||||
-E "test(/high_level_api::.*gpu.*/)"
|
||||
|
||||
.PHONY: test_strings # Run the tests for strings ci
|
||||
test_strings: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
|
||||
--features=$(TARGET_ARCH_FEATURE),shortint,integer,strings -p $(TFHE_SPEC) \
|
||||
-- strings::
|
||||
|
||||
|
||||
.PHONY: test_user_doc # Run tests from the .md documentation
|
||||
test_user_doc: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) --doc \
|
||||
@@ -792,11 +820,7 @@ test_user_doc_gpu: install_rs_build_toolchain
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,gpu,zk-pok -p $(TFHE_SPEC) \
|
||||
-- test_user_docs::
|
||||
|
||||
.PHONY: test_fhe_strings # Run tests for fhe_strings example
|
||||
test_fhe_strings: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
|
||||
--example fhe_strings \
|
||||
--features=$(TARGET_ARCH_FEATURE),integer
|
||||
|
||||
|
||||
.PHONY: test_regex_engine # Run tests for regex_engine example
|
||||
test_regex_engine: install_rs_build_toolchain
|
||||
@@ -823,16 +847,29 @@ test_kreyvium: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
|
||||
-p tfhe-trivium -- --test-threads=1 kreyvium::
|
||||
|
||||
.PHONY: test_concrete_csprng # Run concrete-csprng tests
|
||||
test_concrete_csprng: install_rs_build_toolchain
|
||||
.PHONY: test_tfhe_csprng # Run tfhe-csprng tests
|
||||
test_tfhe_csprng: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
|
||||
--features=$(TARGET_ARCH_FEATURE) -p concrete-csprng
|
||||
--features=$(TARGET_ARCH_FEATURE) -p tfhe-csprng
|
||||
|
||||
.PHONY: test_zk_pok # Run tfhe-zk-pok tests
|
||||
test_zk_pok: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
|
||||
-p tfhe-zk-pok
|
||||
|
||||
.PHONY: test_zk_wasm_x86_compat_ci
|
||||
test_zk_wasm_x86_compat_ci: check_nvm_installed
|
||||
source ~/.nvm/nvm.sh && \
|
||||
nvm install $(NODE_VERSION) && \
|
||||
nvm use $(NODE_VERSION) && \
|
||||
$(MAKE) test_zk_wasm_x86_compat
|
||||
|
||||
.PHONY: test_zk_wasm_x86_compat # Check compatibility between wasm and x86_64 proofs
|
||||
test_zk_wasm_x86_compat: install_rs_build_toolchain build_node_js_api
|
||||
cd tfhe/tests/zk_wasm_x86_test && npm install
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
|
||||
-p tfhe --test zk_wasm_x86_test --features=$(TARGET_ARCH_FEATURE),integer,zk-pok
|
||||
|
||||
.PHONY: test_versionable # Run tests for tfhe-versionable subcrate
|
||||
test_versionable: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
|
||||
@@ -979,7 +1016,7 @@ no_dbg_log:
|
||||
@./scripts/no_dbg_calls.sh
|
||||
|
||||
.PHONY: dieharder_csprng # Run the dieharder test suite on our CSPRNG implementation
|
||||
dieharder_csprng: install_dieharder build_concrete_csprng
|
||||
dieharder_csprng: install_dieharder build_tfhe_csprng
|
||||
./scripts/dieharder_test.sh
|
||||
|
||||
#
|
||||
@@ -993,40 +1030,42 @@ print_doc_bench_parameters:
|
||||
|
||||
.PHONY: bench_integer # Run benchmarks for unsigned integer
|
||||
bench_integer: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench integer-bench \
|
||||
--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
|
||||
|
||||
.PHONY: bench_signed_integer # Run benchmarks for signed integer
|
||||
bench_signed_integer: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench integer-signed-bench \
|
||||
--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
|
||||
|
||||
.PHONY: bench_integer_gpu # Run benchmarks for integer on GPU backend
|
||||
bench_integer_gpu: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench integer-bench \
|
||||
--features=$(TARGET_ARCH_FEATURE),integer,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
|
||||
|
||||
.PHONY: bench_integer_compression # Run benchmarks for unsigned integer compression
|
||||
bench_integer_compression: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench glwe_packing_compression-integer-bench \
|
||||
--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
|
||||
|
||||
.PHONY: bench_integer_compression_gpu
|
||||
bench_integer_compression_gpu: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench glwe_packing_compression-integer-bench \
|
||||
--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,gpu -p $(TFHE_SPEC) --
|
||||
|
||||
.PHONY: bench_integer_multi_bit # Run benchmarks for unsigned integer using multi-bit parameters
|
||||
bench_integer_multi_bit: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=MULTI_BIT \
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_PARAM_TYPE=MULTI_BIT __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
|
||||
__TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench integer-bench \
|
||||
@@ -1034,7 +1073,7 @@ bench_integer_multi_bit: install_rs_check_toolchain
|
||||
|
||||
.PHONY: bench_signed_integer_multi_bit # Run benchmarks for signed integer using multi-bit parameters
|
||||
bench_signed_integer_multi_bit: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=MULTI_BIT \
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_PARAM_TYPE=MULTI_BIT __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
|
||||
__TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench integer-signed-bench \
|
||||
@@ -1042,23 +1081,23 @@ bench_signed_integer_multi_bit: install_rs_check_toolchain
|
||||
|
||||
.PHONY: bench_integer_multi_bit_gpu # Run benchmarks for integer on GPU backend using multi-bit parameters
|
||||
bench_integer_multi_bit_gpu: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=MULTI_BIT \
|
||||
__TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_PARAM_TYPE=MULTI_BIT \
|
||||
__TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench integer-bench \
|
||||
--features=$(TARGET_ARCH_FEATURE),integer,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
|
||||
|
||||
.PHONY: bench_unsigned_integer_multi_bit_gpu # Run benchmarks for unsigned integer on GPU backend using multi-bit parameters
|
||||
bench_unsigned_integer_multi_bit_gpu: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=MULTI_BIT \
|
||||
__TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_PARAM_TYPE=MULTI_BIT \
|
||||
__TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench integer-bench \
|
||||
--features=$(TARGET_ARCH_FEATURE),integer,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) -- ::unsigned
|
||||
|
||||
.PHONY: bench_integer_zk # Run benchmarks for integer encryption with ZK proofs
|
||||
bench_integer_zk: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" \
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench zk-pke-bench \
|
||||
--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,zk-pok,nightly-avx512 \
|
||||
@@ -1080,7 +1119,7 @@ bench_shortint_oprf: install_rs_check_toolchain
|
||||
|
||||
.PHONY: bench_shortint_multi_bit # Run benchmarks for shortint using multi-bit parameters
|
||||
bench_shortint_multi_bit: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=MULTI_BIT \
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_PARAM_TYPE=MULTI_BIT \
|
||||
__TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench shortint-bench \
|
||||
@@ -1164,6 +1203,11 @@ bench_hlapi_erc20_gpu: install_rs_check_toolchain
|
||||
--bench hlapi-erc20 \
|
||||
--features=$(TARGET_ARCH_FEATURE),integer,gpu,internal-keycache,pbs-stats,nightly-avx512 -p $(TFHE_SPEC) --
|
||||
|
||||
.PHONY: bench_tfhe_zk_pok # Run benchmarks for the tfhe_zk_pok crate
|
||||
bench_tfhe_zk_pok: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench -p tfhe-zk-pok --
|
||||
|
||||
#
|
||||
# Utility tools
|
||||
#
|
||||
@@ -1262,6 +1306,189 @@ check_compile_tests
|
||||
.PHONY: conformance # Automatically fix problems that can be fixed
|
||||
conformance: fix_newline fmt
|
||||
|
||||
#=============================== FFT Section ==================================
|
||||
.PHONY: doc_fft # Build rust doc for tfhe-fft
|
||||
doc_fft: install_rs_check_toolchain
|
||||
@# Even though we are not in docs.rs, this allows to "just" build the doc
|
||||
DOCS_RS=1 \
|
||||
RUSTDOCFLAGS="--html-in-header katex-header.html" \
|
||||
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" doc \
|
||||
--all-features --no-deps -p tfhe-fft
|
||||
|
||||
.PHONY: docs_fft # Build rust doc tfhe-fft, alias for doc
|
||||
docs_fft: doc_fft
|
||||
|
||||
.PHONY: lint_doc_fft # Build rust doc for tfhe-fft with linting enabled
|
||||
lint_doc_fft: install_rs_check_toolchain
|
||||
@# Even though we are not in docs.rs, this allows to "just" build the doc
|
||||
DOCS_RS=1 \
|
||||
RUSTDOCFLAGS="--html-in-header katex-header.html -Dwarnings" \
|
||||
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" doc \
|
||||
--all-features --no-deps -p tfhe-fft
|
||||
|
||||
.PHONY: lint_docs_fft # Build rust doc for tfhe-fft with linting enabled, alias for lint_doc
|
||||
lint_docs_fft: lint_doc_fft
|
||||
|
||||
.PHONY: clippy_fft # Run clippy lints on tfhe-fft
|
||||
clippy_fft: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
|
||||
--all-features -p tfhe-fft -- --no-deps -D warnings
|
||||
|
||||
.PHONY: pcc_fft # pcc stands for pre commit checks
|
||||
pcc_fft: check_fmt lint_doc_fft clippy_fft
|
||||
|
||||
.PHONY: build_fft # Build tfhe-fft in release mode, with default features and with fft128
|
||||
build_fft: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) build --release -p tfhe-fft
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) build --release -p tfhe-fft \
|
||||
--features=fft128
|
||||
|
||||
.PHONY: build_fft_no_std # Build tfhe-fft without default features, with and without fft128
|
||||
build_fft_no_std: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) build --release -p tfhe-fft \
|
||||
--no-default-features
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) build --release -p tfhe-fft \
|
||||
--no-default-features \
|
||||
--features=fft128
|
||||
|
||||
##### Tests #####
|
||||
|
||||
.PHONY: test_fft # Run tfhe-fft tests in release mode, with default features and with fft128
|
||||
test_fft: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --release -p tfhe-fft
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --release -p tfhe-fft \
|
||||
--features=fft128
|
||||
|
||||
.PHONY: test_fft_serde # Run tfhe-fft tests with the serde feature, with and without fft128
|
||||
test_fft_serde: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --release -p tfhe-fft \
|
||||
--features=serde
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --release -p tfhe-fft \
|
||||
--features=serde,fft128
|
||||
|
||||
.PHONY: test_fft_nightly # Run tfhe-fft tests with the nightly feature on the check toolchain, with and without fft128
|
||||
test_fft_nightly: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) test --release -p tfhe-fft \
|
||||
--features=nightly
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) test --release -p tfhe-fft \
|
||||
--features=nightly,fft128
|
||||
|
||||
.PHONY: test_fft_no_std # Run tfhe-fft tests without default features, with and without fft128
|
||||
test_fft_no_std: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --release -p tfhe-fft \
|
||||
--no-default-features
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --release -p tfhe-fft \
|
||||
--no-default-features \
|
||||
--features=fft128
|
||||
|
||||
.PHONY: test_fft_no_std_nightly # Run tfhe-fft tests without default features plus nightly, with and without fft128
|
||||
test_fft_no_std_nightly: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) test --release -p tfhe-fft \
|
||||
--no-default-features \
|
||||
--features=nightly
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) test --release -p tfhe-fft \
|
||||
--no-default-features \
|
||||
--features=nightly,fft128
|
||||
|
||||
.PHONY: test_fft_node_js # Run tfhe-fft tests for the wasm32-unknown-unknown target with the serde feature
|
||||
test_fft_node_js: install_rs_build_toolchain install_build_wasm32_target install_wasm_bindgen_cli
|
||||
RUSTFLAGS="" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --release \
|
||||
--features=serde --target wasm32-unknown-unknown -p tfhe-fft
|
||||
|
||||
.PHONY: test_fft_node_js_ci # Run test_fft_node_js after installing and selecting NODE_VERSION with nvm
|
||||
test_fft_node_js_ci: check_nvm_installed
|
||||
source ~/.nvm/nvm.sh && \
|
||||
nvm install $(NODE_VERSION) && \
|
||||
nvm use $(NODE_VERSION) && \
|
||||
"$(MAKE)" test_fft_node_js
|
||||
|
||||
.PHONY: test_fft_all # Run every tfhe-fft test target listed as a prerequisite
|
||||
test_fft_all: test_fft test_fft_serde test_fft_nightly test_fft_no_std test_fft_no_std_nightly \
|
||||
test_fft_node_js_ci
|
||||
|
||||
##### Bench #####
|
||||
|
||||
.PHONY: bench_fft # Run FFT benchmarks
|
||||
bench_fft: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" bench --bench fft -p tfhe-fft \
|
||||
--features=serde \
|
||||
--features=nightly \
|
||||
--features=fft128
|
||||
#============================End FFT Section ==================================
|
||||
|
||||
#=============================== NTT Section ==================================
|
||||
.PHONY: doc_ntt # Build rust doc for tfhe-ntt
|
||||
doc_ntt: install_rs_check_toolchain
|
||||
@# Even though we are not in docs.rs, this allows to "just" build the doc
|
||||
DOCS_RS=1 \
|
||||
RUSTDOCFLAGS="--html-in-header katex-header.html" \
|
||||
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" doc \
|
||||
--all-features --no-deps -p tfhe-ntt
|
||||
|
||||
.PHONY: docs_ntt # Build rust doc tfhe-ntt, alias for doc
|
||||
docs_ntt: doc_ntt
|
||||
|
||||
.PHONY: lint_doc_ntt # Build rust doc for tfhe-ntt with linting enabled
|
||||
lint_doc_ntt: install_rs_check_toolchain
|
||||
@# Even though we are not in docs.rs, this allows to "just" build the doc
|
||||
DOCS_RS=1 \
|
||||
RUSTDOCFLAGS="--html-in-header katex-header.html -Dwarnings" \
|
||||
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" doc \
|
||||
--all-features --no-deps -p tfhe-ntt
|
||||
|
||||
.PHONY: lint_docs_ntt # Build rust doc for tfhe-ntt with linting enabled, alias for lint_doc
|
||||
lint_docs_ntt: lint_doc_ntt
|
||||
|
||||
.PHONY: clippy_ntt # Run clippy lints on tfhe-ntt
|
||||
clippy_ntt: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
|
||||
--all-features -p tfhe-ntt -- --no-deps -D warnings
|
||||
|
||||
.PHONY: pcc_ntt # pcc stands for pre commit checks
|
||||
pcc_ntt: check_fmt lint_doc_ntt clippy_ntt
|
||||
|
||||
.PHONY: build_ntt # Build tfhe-ntt in release mode
|
||||
build_ntt: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) build --release -p tfhe-ntt
|
||||
|
||||
.PHONY: build_ntt_no_std # Build tfhe-ntt in release mode without default features
|
||||
build_ntt_no_std: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) build --release -p tfhe-ntt \
|
||||
--no-default-features
|
||||
|
||||
##### Tests #####
|
||||
|
||||
.PHONY: test_ntt # Run tfhe-ntt tests in release mode
|
||||
test_ntt: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --release -p tfhe-ntt
|
||||
|
||||
.PHONY: test_ntt_nightly # Run tfhe-ntt tests with the nightly feature on the check toolchain
|
||||
test_ntt_nightly: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) test --release -p tfhe-ntt \
|
||||
--features=nightly
|
||||
|
||||
.PHONY: test_ntt_no_std # Run tfhe-ntt tests without default features
|
||||
test_ntt_no_std: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --release -p tfhe-ntt \
|
||||
--no-default-features
|
||||
|
||||
.PHONY: test_ntt_no_std_nightly # Run tfhe-ntt tests without default features plus the nightly feature
|
||||
test_ntt_no_std_nightly: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) test --release -p tfhe-ntt \
|
||||
--no-default-features \
|
||||
--features=nightly
|
||||
|
||||
.PHONY: test_ntt_all # Run every tfhe-ntt test target listed as a prerequisite
|
||||
test_ntt_all: test_ntt test_ntt_no_std test_ntt_nightly test_ntt_no_std_nightly
|
||||
|
||||
##### Bench #####
|
||||
|
||||
.PHONY: bench_ntt # Run NTT benchmarks
|
||||
bench_ntt: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" bench --bench ntt -p tfhe-ntt \
|
||||
--features=nightly
|
||||
#============================End NTT Section ==================================
|
||||
|
||||
.PHONY: help # Generate list of targets with descriptions
|
||||
help:
|
||||
@grep '^\.PHONY: .* #' Makefile | sed 's/\.PHONY: \(.*\) # \(.*\)/\1\t\2/' | expand -t30 | sort
|
||||
|
||||
@@ -89,7 +89,7 @@ tfhe = { version = "*", features = ["boolean", "shortint", "integer", "x86_64"]
|
||||
```
|
||||
|
||||
> [!Note]
|
||||
> Note: You need to use a Rust version >= 1.73 to compile TFHE-rs.
|
||||
> Note: You need to use a Rust version >= 1.81 to compile TFHE-rs.
|
||||
|
||||
> [!Note]
|
||||
> Note: aarch64-based machines are not yet supported for Windows as it's currently missing an entropy source to be able to seed the [CSPRNGs](https://en.wikipedia.org/wiki/Cryptographically_secure_pseudorandom_number_generator) used in TFHE-rs.
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "tfhe-cuda-backend"
|
||||
version = "0.5.0"
|
||||
version = "0.6.0"
|
||||
edition = "2021"
|
||||
authors = ["Zama team"]
|
||||
license = "BSD-3-Clause-Clear"
|
||||
|
||||
@@ -62,6 +62,7 @@ fn main() {
|
||||
"cuda/include/integer/integer.h",
|
||||
"cuda/include/keyswitch.h",
|
||||
"cuda/include/linear_algebra.h",
|
||||
"cuda/include/pbs/fft.h",
|
||||
"cuda/include/pbs/programmable_bootstrap.h",
|
||||
"cuda/include/pbs/programmable_bootstrap_multibit.h",
|
||||
];
|
||||
|
||||
@@ -83,7 +83,7 @@ endif()
|
||||
set(CMAKE_CUDA_FLAGS
|
||||
"${CMAKE_CUDA_FLAGS} -ccbin ${CMAKE_CXX_COMPILER} ${OPTIMIZATION_FLAGS}\
|
||||
-std=c++17 --no-exceptions --expt-relaxed-constexpr -rdc=true \
|
||||
--use_fast_math -Xcompiler -fPIC")
|
||||
--use_fast_math -Xcompiler -fPIC --ptxas-options=-v")
|
||||
|
||||
set(INCLUDE_DIR include)
|
||||
|
||||
|
||||
@@ -39,7 +39,7 @@ void *cuda_malloc_async(uint64_t size, cudaStream_t stream, uint32_t gpu_index);
|
||||
|
||||
void cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index);
|
||||
|
||||
void cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
|
||||
void cuda_memcpy_async_to_gpu(void *dest, const void *src, uint64_t size,
|
||||
cudaStream_t stream, uint32_t gpu_index);
|
||||
|
||||
void cuda_memcpy_async_gpu_to_gpu(void *dest, void const *src, uint64_t size,
|
||||
|
||||
@@ -103,17 +103,19 @@ void cleanup_cuda_full_propagation(void *const *streams,
|
||||
|
||||
void scratch_cuda_integer_mult_radix_ciphertext_kb_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
int8_t **mem_ptr, uint32_t message_modulus, uint32_t carry_modulus,
|
||||
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t pbs_base_log, uint32_t pbs_level, uint32_t ks_base_log,
|
||||
uint32_t ks_level, uint32_t grouping_factor, uint32_t num_blocks,
|
||||
PBS_TYPE pbs_type, bool allocate_gpu_memory);
|
||||
int8_t **mem_ptr, bool const is_boolean_left, bool const is_boolean_right,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, uint32_t glwe_dimension,
|
||||
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t pbs_base_log,
|
||||
uint32_t pbs_level, uint32_t ks_base_log, uint32_t ks_level,
|
||||
uint32_t grouping_factor, uint32_t num_blocks, PBS_TYPE pbs_type,
|
||||
bool allocate_gpu_memory);
|
||||
|
||||
void cuda_integer_mult_radix_ciphertext_kb_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
void *radix_lwe_out, void const *radix_lwe_left,
|
||||
void const *radix_lwe_right, void *const *bsks, void *const *ksks,
|
||||
int8_t *mem_ptr, uint32_t polynomial_size, uint32_t num_blocks);
|
||||
void *radix_lwe_out, void const *radix_lwe_left, bool const is_bool_left,
|
||||
void const *radix_lwe_right, bool const is_bool_right, void *const *bsks,
|
||||
void *const *ksks, int8_t *mem_ptr, uint32_t polynomial_size,
|
||||
uint32_t num_blocks);
|
||||
|
||||
void cleanup_cuda_integer_mult(void *const *streams,
|
||||
uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
@@ -297,47 +299,6 @@ void cleanup_cuda_propagate_single_carry(void *const *streams,
|
||||
uint32_t gpu_count,
|
||||
int8_t **mem_ptr_void);
|
||||
|
||||
void scratch_cuda_fast_propagate_single_carry_kb_64_inplace(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
|
||||
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t requested_flag,
|
||||
uint32_t uses_carry, bool allocate_gpu_memory);
|
||||
|
||||
void cuda_fast_propagate_single_carry_kb_64_inplace(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
void *lwe_array, void *carry_out, const void *carry_in, int8_t *mem_ptr,
|
||||
void *const *bsks, void *const *ksks, uint32_t num_blocks,
|
||||
uint32_t requested_flag, uint32_t uses_carry);
|
||||
|
||||
void cleanup_cuda_fast_propagate_single_carry(void *const *streams,
|
||||
uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count,
|
||||
int8_t **mem_ptr_void);
|
||||
|
||||
void scratch_cuda_integer_overflowing_sub_kb_64_inplace(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
|
||||
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t compute_overflow,
|
||||
bool allocate_gpu_memory);
|
||||
|
||||
void cuda_integer_overflowing_sub_kb_64_inplace(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
void *lhs_array, const void *rhs_array, void *overflow_block,
|
||||
const void *input_borrow, int8_t *mem_ptr, void *const *bsks,
|
||||
void *const *ksks, uint32_t num_blocks, uint32_t compute_overflow,
|
||||
uint32_t uses_input_borrow);
|
||||
|
||||
void cleanup_cuda_integer_overflowing_sub(void *const *streams,
|
||||
uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count,
|
||||
int8_t **mem_ptr_void);
|
||||
|
||||
void scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
@@ -398,16 +359,17 @@ void cleanup_cuda_integer_radix_scalar_mul(void *const *streams,
|
||||
|
||||
void scratch_cuda_integer_div_rem_radix_ciphertext_kb_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
|
||||
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory);
|
||||
bool is_signed, int8_t **mem_ptr, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t big_lwe_dimension,
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
|
||||
PBS_TYPE pbs_type, bool allocate_gpu_memory);
|
||||
|
||||
void cuda_integer_div_rem_radix_ciphertext_kb_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
void *quotient, void *remainder, void const *numerator, void const *divisor,
|
||||
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
|
||||
bool is_signed, int8_t *mem_ptr, void *const *bsks, void *const *ksks,
|
||||
uint32_t num_blocks_in_radix);
|
||||
|
||||
void cleanup_cuda_integer_div_rem(void *const *streams,
|
||||
@@ -458,5 +420,24 @@ void cuda_integer_reverse_blocks_64_inplace(void *const *streams,
|
||||
uint32_t num_blocks,
|
||||
uint32_t lwe_size);
|
||||
|
||||
// Declaration only; the definition is not visible here.
// Takes the stream / GPU-index arrays (`gpu_count` entries each, presumably —
// TODO confirm), an out-parameter `mem_ptr`, a signedness flag, the
// cryptographic parameters (GLWE/LWE dimensions, keyswitch and PBS
// level/base-log, grouping factor, message/carry moduli, PBS type) and an
// `allocate_gpu_memory` flag.
// NOTE(review): by analogy with the other scratch_* entry points this
// presumably allocates the scratch buffer consumed by
// cuda_integer_abs_inplace_radix_ciphertext_kb_64 and stores it in `*mem_ptr`
// — confirm against the implementation.
void scratch_cuda_integer_abs_inplace_radix_ciphertext_kb_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
int8_t **mem_ptr, bool is_signed, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t big_lwe_dimension,
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
|
||||
PBS_TYPE pbs_type, bool allocate_gpu_memory);
|
||||
|
||||
// Declaration only; the definition is not visible here.
// `ct` is the only non-const data pointer, which matches the "inplace" name:
// presumably the radix ciphertext is read and overwritten with its absolute
// value — TODO confirm against the implementation.
// `mem_ptr` is passed by value (int8_t *), unlike the scratch/cleanup entry
// points which take int8_t **; `bsks` / `ksks` are arrays of opaque key
// pointers (presumably bootstrapping and keyswitching keys — verify).
void cuda_integer_abs_inplace_radix_ciphertext_kb_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
void *ct, int8_t *mem_ptr, bool is_signed, void *const *bsks,
|
||||
void *const *ksks, uint32_t num_blocks);
|
||||
|
||||
// Declaration only; the definition is not visible here.
// Receives the same stream / GPU-index arrays as the scratch call and the
// buffer handle by address (`mem_ptr_void`).
// NOTE(review): presumably releases the buffer created by
// scratch_cuda_integer_abs_inplace_radix_ciphertext_kb_64 — confirm against
// the implementation.
void cleanup_cuda_integer_abs_inplace(void *const *streams,
|
||||
uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count,
|
||||
int8_t **mem_ptr_void);
|
||||
|
||||
} // extern C
|
||||
#endif // CUDA_INTEGER_H
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -27,12 +27,6 @@ void cuda_add_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index,
|
||||
void const *lwe_array_in_2,
|
||||
uint32_t input_lwe_dimension,
|
||||
uint32_t input_lwe_ciphertext_count);
|
||||
void cuda_add_lwe_ciphertext_vector_64_with_packing(
|
||||
void *stream, uint32_t gpu_index, void *lwe_array_out,
|
||||
void const *lwe_array_in_1, void const *lwe_array_in_2,
|
||||
uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count,
|
||||
uint32_t message_modulus);
|
||||
|
||||
void cuda_add_lwe_ciphertext_vector_plaintext_vector_32(
|
||||
void *stream, uint32_t gpu_index, void *lwe_array_out,
|
||||
void const *lwe_array_in, void const *plaintext_array_in,
|
||||
|
||||
17
backends/tfhe-cuda-backend/cuda/include/pbs/fft.h
Normal file
17
backends/tfhe-cuda-backend/cuda/include/pbs/fft.h
Normal file
@@ -0,0 +1,17 @@
|
||||
#include <stdint.h>
|
||||
// C-linkage declarations for the f128 Fourier-transform entry points.
// NOTE(review): no include guard is visible in this header, unlike
// integer.h which closes with `#endif // CUDA_INTEGER_H` — consider adding one.
extern "C" {
|
||||
// Forward transform, "as_torus" variant. `standard` is the only
// const-qualified buffer, so it is the input; re0/re1/im0/im1 are writable.
// NOTE(review): re0/re1 and im0/im1 are presumably the two f64 halves of the
// f128 real and imaginary parts, and `N` the polynomial size — TODO confirm.
// The `_async` suffix suggests work is only enqueued on `stream`; verify.
void cuda_fourier_transform_forward_as_torus_f128_async(
|
||||
void *stream, uint32_t gpu_index, void *re0, void *re1, void *im0,
|
||||
void *im1, void const *standard, uint32_t const N,
|
||||
const uint32_t number_of_samples);
|
||||
|
||||
// Forward transform, "as_integer" variant; same parameter list as the
// as_torus variant above. How the two interpretations of `standard` differ is
// not visible from this declaration.
void cuda_fourier_transform_forward_as_integer_f128_async(
|
||||
void *stream, uint32_t gpu_index, void *re0, void *re1, void *im0,
|
||||
void *im1, void const *standard, uint32_t const N,
|
||||
const uint32_t number_of_samples);
|
||||
|
||||
// Backward transform: constness is reversed relative to the forward
// declarations — re0/re1/im0/im1 are const inputs and `standard` is the
// writable output.
void cuda_fourier_transform_backward_as_torus_f128_async(
|
||||
void *stream, uint32_t gpu_index, void *standard, void const *re0,
|
||||
void const *re1, void const *im0, void const *im1, uint32_t const N,
|
||||
const uint32_t number_of_samples);
|
||||
}
|
||||
@@ -106,7 +106,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
|
||||
uint32_t lwe_chunk_size;
|
||||
double2 *keybundle_fft;
|
||||
Torus *global_accumulator;
|
||||
double2 *global_accumulator_fft;
|
||||
double2 *global_join_buffer;
|
||||
|
||||
PBS_VARIANT pbs_variant;
|
||||
|
||||
@@ -225,10 +225,12 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
|
||||
num_blocks_keybundle * (polynomial_size / 2) * sizeof(double2),
|
||||
stream, gpu_index);
|
||||
global_accumulator = (Torus *)cuda_malloc_async(
|
||||
num_blocks_acc_step_one * polynomial_size * sizeof(Torus), stream,
|
||||
gpu_index);
|
||||
global_accumulator_fft = (double2 *)cuda_malloc_async(
|
||||
num_blocks_acc_step_one * (polynomial_size / 2) * sizeof(double2),
|
||||
input_lwe_ciphertext_count * (glwe_dimension + 1) * polynomial_size *
|
||||
sizeof(Torus),
|
||||
stream, gpu_index);
|
||||
global_join_buffer = (double2 *)cuda_malloc_async(
|
||||
level_count * (glwe_dimension + 1) * input_lwe_ciphertext_count *
|
||||
(polynomial_size / 2) * sizeof(double2),
|
||||
stream, gpu_index);
|
||||
}
|
||||
}
|
||||
@@ -260,7 +262,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
|
||||
|
||||
cuda_drop_async(keybundle_fft, stream, gpu_index);
|
||||
cuda_drop_async(global_accumulator, stream, gpu_index);
|
||||
cuda_drop_async(global_accumulator_fft, stream, gpu_index);
|
||||
cuda_drop_async(global_join_buffer, stream, gpu_index);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -69,7 +69,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
|
||||
int8_t *d_mem;
|
||||
|
||||
Torus *global_accumulator;
|
||||
double2 *global_accumulator_fft;
|
||||
double2 *global_join_buffer;
|
||||
|
||||
PBS_VARIANT pbs_variant;
|
||||
|
||||
@@ -114,7 +114,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
|
||||
// Otherwise, both kernels run all in shared memory
|
||||
d_mem = (int8_t *)cuda_malloc_async(device_mem, stream, gpu_index);
|
||||
|
||||
global_accumulator_fft = (double2 *)cuda_malloc_async(
|
||||
global_join_buffer = (double2 *)cuda_malloc_async(
|
||||
(glwe_dimension + 1) * level_count * input_lwe_ciphertext_count *
|
||||
(polynomial_size / 2) * sizeof(double2),
|
||||
stream, gpu_index);
|
||||
@@ -147,7 +147,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
|
||||
// Otherwise, both kernels run all in shared memory
|
||||
d_mem = (int8_t *)cuda_malloc_async(device_mem, stream, gpu_index);
|
||||
|
||||
global_accumulator_fft = (double2 *)cuda_malloc_async(
|
||||
global_join_buffer = (double2 *)cuda_malloc_async(
|
||||
(glwe_dimension + 1) * level_count * input_lwe_ciphertext_count *
|
||||
polynomial_size / 2 * sizeof(double2),
|
||||
stream, gpu_index);
|
||||
@@ -194,7 +194,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
|
||||
// Otherwise, both kernels run all in shared memory
|
||||
d_mem = (int8_t *)cuda_malloc_async(device_mem, stream, gpu_index);
|
||||
|
||||
global_accumulator_fft = (double2 *)cuda_malloc_async(
|
||||
global_join_buffer = (double2 *)cuda_malloc_async(
|
||||
(glwe_dimension + 1) * level_count * input_lwe_ciphertext_count *
|
||||
polynomial_size / 2 * sizeof(double2),
|
||||
stream, gpu_index);
|
||||
@@ -208,7 +208,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
|
||||
|
||||
void release(cudaStream_t stream, uint32_t gpu_index) {
|
||||
cuda_drop_async(d_mem, stream, gpu_index);
|
||||
cuda_drop_async(global_accumulator_fft, stream, gpu_index);
|
||||
cuda_drop_async(global_join_buffer, stream, gpu_index);
|
||||
|
||||
if (pbs_variant == DEFAULT)
|
||||
cuda_drop_async(global_accumulator, stream, gpu_index);
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
#ifndef CNCRT_CRYPTO_CUH
|
||||
// Must be spelled exactly like the macro tested by the preceding #ifndef;
// the previous spelling (CRPYTO) left the include guard without effect.
#define CNCRT_CRYPTO_CUH
|
||||
|
||||
#include "crypto/torus.cuh"
|
||||
#include "device.h"
|
||||
#include <cstdint>
|
||||
|
||||
@@ -21,7 +22,6 @@ private:
|
||||
uint32_t base_log;
|
||||
uint32_t mask;
|
||||
uint32_t num_poly;
|
||||
int current_level;
|
||||
T mask_mod_b;
|
||||
T *state;
|
||||
|
||||
@@ -32,13 +32,6 @@ public:
|
||||
state(state) {
|
||||
|
||||
mask_mod_b = (1ll << base_log) - 1ll;
|
||||
current_level = level_count;
|
||||
int tid = threadIdx.x;
|
||||
for (int i = 0; i < num_poly * params::opt; i++) {
|
||||
state[tid] >>= (sizeof(T) * 8 - base_log * level_count);
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
synchronize_threads_in_block();
|
||||
}
|
||||
|
||||
// Decomposes all polynomials at once
|
||||
@@ -52,28 +45,30 @@ public:
|
||||
// Decomposes a single polynomial
|
||||
__device__ void decompose_and_compress_next_polynomial(double2 *result,
|
||||
int j) {
|
||||
if (j == 0)
|
||||
current_level -= 1;
|
||||
|
||||
int tid = threadIdx.x;
|
||||
auto state_slice = state + j * params::degree;
|
||||
uint32_t tid = threadIdx.x;
|
||||
auto state_slice = &state[j * params::degree];
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
T res_re = state_slice[tid] & mask_mod_b;
|
||||
T res_im = state_slice[tid + params::degree / 2] & mask_mod_b;
|
||||
state_slice[tid] >>= base_log;
|
||||
state_slice[tid + params::degree / 2] >>= base_log;
|
||||
T carry_re = ((res_re - 1ll) | state_slice[tid]) & res_re;
|
||||
T carry_im =
|
||||
((res_im - 1ll) | state_slice[tid + params::degree / 2]) & res_im;
|
||||
auto input1 = &state_slice[tid];
|
||||
auto input2 = &state_slice[tid + params::degree / 2];
|
||||
T res_re = *input1 & mask_mod_b;
|
||||
T res_im = *input2 & mask_mod_b;
|
||||
|
||||
*input1 >>= base_log; // Update state
|
||||
*input2 >>= base_log; // Update state
|
||||
|
||||
T carry_re = ((res_re - 1ll) | *input1) & res_re;
|
||||
T carry_im = ((res_im - 1ll) | *input2) & res_im;
|
||||
carry_re >>= (base_log - 1);
|
||||
carry_im >>= (base_log - 1);
|
||||
state_slice[tid] += carry_re;
|
||||
state_slice[tid + params::degree / 2] += carry_im;
|
||||
|
||||
*input1 += carry_re; // Update state
|
||||
*input2 += carry_im; // Update state
|
||||
|
||||
res_re -= carry_re << base_log;
|
||||
res_im -= carry_im << base_log;
|
||||
|
||||
result[tid].x = (int32_t)res_re;
|
||||
result[tid].y = (int32_t)res_im;
|
||||
typecast_torus_to_double(res_re, result[tid].x);
|
||||
typecast_torus_to_double(res_im, result[tid].y);
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
|
||||
@@ -71,12 +71,10 @@ keyswitch(Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
|
||||
|
||||
// This loop distribution seems to benefit the global mem reads
|
||||
for (int i = start_i; i < end_i; i++) {
|
||||
Torus a_i = round_to_closest_multiple(block_lwe_array_in[i], base_log,
|
||||
level_count);
|
||||
Torus state = a_i >> (sizeof(Torus) * 8 - base_log * level_count);
|
||||
Torus state =
|
||||
init_decomposer_state(block_lwe_array_in[i], base_log, level_count);
|
||||
|
||||
for (int j = level_count - 1; j >= 0; j--) {
|
||||
// Levels are stored in reverse order
|
||||
for (int j = 0; j < level_count; j++) {
|
||||
auto ksk_block =
|
||||
get_ith_block(ksk, i, j, lwe_dimension_out, level_count);
|
||||
Torus decomposed = decompose_one<Torus>(state, mask_mod_b, base_log);
|
||||
@@ -202,15 +200,13 @@ __device__ void packing_keyswitch_lwe_ciphertext_into_glwe_ciphertext(
|
||||
// Iterate through all lwe elements
|
||||
for (int i = 0; i < lwe_dimension_in; i++) {
|
||||
// Round and prepare decomposition
|
||||
Torus a_i = round_to_closest_multiple(lwe_in[i], base_log, level_count);
|
||||
Torus state = init_decomposer_state(lwe_in[i], base_log, level_count);
|
||||
|
||||
Torus state = a_i >> (sizeof(Torus) * 8 - base_log * level_count);
|
||||
Torus mod_b_mask = (1ll << base_log) - 1ll;
|
||||
|
||||
// block of key for current lwe coefficient (cur_input_lwe[i])
|
||||
auto ksk_block = &fp_ksk[i * ksk_block_size];
|
||||
for (int j = level_count - 1; j >= 0; j--) {
|
||||
// Levels are stored in reverse order
|
||||
for (int j = 0; j < level_count; j++) {
|
||||
auto ksk_glwe = &ksk_block[j * glwe_size * polynomial_size];
|
||||
// Iterate through each level and multiply by the ksk piece
|
||||
auto ksk_glwe_chunk = &ksk_glwe[poly_id * coef_per_block];
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
#ifndef CNCRT_TORUS_CUH
|
||||
#define CNCRT_TORUS_CUH
|
||||
|
||||
#include "device.h"
|
||||
#include "polynomial/parameters.cuh"
|
||||
#include "types/int128.cuh"
|
||||
#include "utils/kernel_dimensions.cuh"
|
||||
@@ -11,6 +12,11 @@ __host__ __device__ __forceinline__ constexpr double get_two_pow_torus_bits() {
|
||||
return (sizeof(T) == 4) ? 4294967296.0 : 18446744073709551616.0;
|
||||
}
|
||||
|
||||
/// Returns the largest value representable by the scalar type T, as reported
/// by std::numeric_limits<T>.
template <typename T>
__host__ __device__ __forceinline__ constexpr T scalar_max() {
  using limits = std::numeric_limits<T>;
  return limits::max();
}
|
||||
|
||||
template <typename T>
|
||||
__device__ inline void typecast_double_to_torus(double x, T &r) {
|
||||
r = T(x);
|
||||
@@ -44,14 +50,36 @@ __device__ inline void typecast_double_round_to_torus(double x, T &r) {
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ inline T round_to_closest_multiple(T x, uint32_t base_log,
|
||||
uint32_t level_count) {
|
||||
const T non_rep_bit_count = sizeof(T) * 8 - level_count * base_log;
|
||||
const T shift = non_rep_bit_count - 1;
|
||||
T res = x >> shift;
|
||||
res += 1;
|
||||
res &= (T)(-2);
|
||||
return res << shift;
|
||||
__device__ inline void typecast_torus_to_double(T x, double &r);
|
||||
|
||||
/// 32-bit specialization: writes into r the double obtained from x through
/// the __int2double_rn intrinsic.
template <>
__device__ inline void typecast_torus_to_double<uint32_t>(uint32_t x,
                                                          double &r) {
  // __int2double_rn takes a signed int; spell out the conversion that the
  // call would otherwise perform implicitly.
  const int x_as_signed = static_cast<int>(x);
  r = __int2double_rn(x_as_signed);
}
|
||||
|
||||
/// 64-bit specialization: writes into r the double obtained from x through
/// the __ll2double_rn intrinsic.
template <>
__device__ inline void typecast_torus_to_double<uint64_t>(uint64_t x,
                                                          double &r) {
  // __ll2double_rn takes a signed long long; spell out the conversion that
  // the call would otherwise perform implicitly.
  const long long x_as_signed = static_cast<long long>(x);
  r = __ll2double_rn(x_as_signed);
}
|
||||
|
||||
/// Builds the initial decomposition state for `input`.
///
/// Only the top `level_count * base_log` bits of the input are kept, rounded
/// using the bit just below them. When the rounded value needs balancing
/// (see `need_balance` below), 2^(level_count * base_log) is subtracted from
/// the returned state.
///
/// @param input       value to decompose
/// @param base_log    number of bits per decomposition level
/// @param level_count number of decomposition levels
/// @return the rounded (and possibly balanced) state
template <typename T>
__device__ inline T init_decomposer_state(T input, uint32_t base_log,
                                          uint32_t level_count) {
  const T represented_bits = level_count * base_log;
  const T dropped_bits = sizeof(T) * 8 - represented_bits;

  // Keep one extra low bit: it decides the rounding direction.
  const T with_guard_bit = input >> (dropped_bits - 1);
  const T rounding_bit = with_guard_bit & (T)(1);

  // Round: add one at the guard position, then discard the guard bit.
  T state = with_guard_bit;
  state++;
  state >>= 1;

  // Reduce modulo 2^represented_bits.
  const T mod_mask = scalar_max<T>() >> dropped_bits;
  state &= mod_mask;

  // Top represented bit of (((state - 1) | shifted guard bit) & state).
  const T shifted_random = rounding_bit << (represented_bits - 1);
  const T need_balance = (((state - (T)(1)) | shifted_random) & state) >>
                         (represented_bits - 1);

  return state - (need_balance << represented_bits);
}
|
||||
|
||||
template <typename T>
|
||||
|
||||
@@ -97,7 +97,7 @@ bool cuda_check_support_thread_block_clusters() {
|
||||
}
|
||||
|
||||
/// Copy memory to the GPU asynchronously
|
||||
void cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
|
||||
void cuda_memcpy_async_to_gpu(void *dest, const void *src, uint64_t size,
|
||||
cudaStream_t stream, uint32_t gpu_index) {
|
||||
if (size == 0)
|
||||
return;
|
||||
@@ -268,17 +268,20 @@ void cuda_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index) {
|
||||
/// Get the maximum size for the shared memory
|
||||
int cuda_get_max_shared_memory(uint32_t gpu_index) {
|
||||
int max_shared_memory = 0;
|
||||
cudaDeviceGetAttribute(&max_shared_memory, cudaDevAttrMaxSharedMemoryPerBlock,
|
||||
gpu_index);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
#if CUDA_ARCH == 900
|
||||
max_shared_memory = 226000;
|
||||
#elif CUDA_ARCH == 890
|
||||
max_shared_memory = 100000;
|
||||
#elif CUDA_ARCH == 860
|
||||
max_shared_memory = 100000;
|
||||
#elif CUDA_ARCH == 800
|
||||
max_shared_memory = 163000;
|
||||
#elif CUDA_ARCH == 700
|
||||
max_shared_memory = 95000;
|
||||
#else
|
||||
cudaDeviceGetAttribute(&max_shared_memory, cudaDevAttrMaxSharedMemoryPerBlock,
|
||||
gpu_index);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
#endif
|
||||
return max_shared_memory;
|
||||
}
|
||||
|
||||
370
backends/tfhe-cuda-backend/cuda/src/fft128/f128.cuh
Normal file
370
backends/tfhe-cuda-backend/cuda/src/fft128/f128.cuh
Normal file
@@ -0,0 +1,370 @@
|
||||
|
||||
#ifndef TFHE_RS_BACKENDS_TFHE_CUDA_BACKEND_CUDA_SRC_FFT128_F128_CUH_
|
||||
#define TFHE_RS_BACKENDS_TFHE_CUDA_BACKEND_CUDA_SRC_FFT128_F128_CUH_
|
||||
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
|
||||
/// Extended-precision value stored as a pair of doubles (hi, lo).
///
/// The static helpers below are the usual error-free transformations
/// (two-sum, two-product, ...) and the arithmetic built on top of them.
/// When compiled for the device (__CUDA_ARCH__ defined) most helpers use the
/// explicit round-to-nearest intrinsics (__dadd_rn, __dsub_rn, __dmul_rn,
/// __fma_rn); on the host they use the plain operators and fma().
struct alignas(16) f128 {
  double hi;
  double lo;

  // Default and parameterized constructors
  __host__ __device__ f128() : hi(0.0), lo(0.0) {}
  __host__ __device__ f128(double high, double low) : hi(high), lo(low) {}

  /// Quick two-sum: returns (s, e) with s = fl(a + b) and e = b - (s - a).
  /// NOTE(review): as in the textbook Fast2Sum, e is presumably only the
  /// exact rounding error when |a| >= |b| - confirm at call sites.
  __host__ __device__ __forceinline__ static f128 quick_two_sum(double a,
                                                                double b) {
#ifdef __CUDA_ARCH__
    double s = __dadd_rn(a, b);
    return f128(s, __dsub_rn(b, __dsub_rn(s, a)));
#else
    double s = a + b;
    return f128(s, b - (s - a));
#endif
  }

  /// Two-sum: returns (s, e) with s = fl(a + b) and
  /// e = (a - (s - bb)) + (b - bb), where bb = s - a.
  __host__ __device__ __forceinline__ static f128 two_sum(double a, double b) {
#ifdef __CUDA_ARCH__
    double s = __dadd_rn(a, b);
    double bb = __dsub_rn(s, a);
    return f128(s, __dadd_rn(__dsub_rn(a, __dsub_rn(s, bb)), __dsub_rn(b, bb)));
#else
    double s = a + b;
    double bb = s - a;
    return f128(s, (a - (s - bb)) + (b - bb));
#endif
  }

  /// Two-product: returns (p, e) with p = fl(a * b) and e = fma(a, b, -p).
  __host__ __device__ __forceinline__ static f128 two_prod(double a, double b) {

#ifdef __CUDA_ARCH__
    double p = __dmul_rn(a, b);
    double p2 = __fma_rn(a, b, -p);
#else
    double p = a * b;
    double p2 = fma(a, b, -p);
#endif
    return f128(p, p2);
  }

  /// Two-diff: returns (s, e) with s = fl(a - b) and
  /// e = (a - (s - bb)) - (b + bb), where bb = s - a.
  __host__ __device__ __forceinline__ static f128 two_diff(double a, double b) {
#ifdef __CUDA_ARCH__
    double s = __dsub_rn(a, b);
    double bb = __dsub_rn(s, a);
    return f128(s, __dsub_rn(__dsub_rn(a, __dsub_rn(s, bb)), __dadd_rn(b, bb)));
#else
    double s = a - b;
    double bb = s - a;
    return f128(s, (a - (s - bb)) - (b + bb));
#endif
  }

  /// Addition: combines the two-sums of the hi parts and of the lo parts.
  __host__ __device__ static f128 add(const f128 &a, const f128 &b) {
    auto s = two_sum(a.hi, b.hi);
    auto t = two_sum(a.lo, b.lo);

    double hi = s.hi;
    double lo = s.lo + t.hi;
    hi = hi + lo;
    lo = lo - (hi - s.hi);

    return f128(hi, lo + t.lo);
  }

  /// Addition with estimate: two-sum of the hi parts, with both lo parts
  /// folded into the error term before renormalizing with quick_two_sum.
  __host__ __device__ static f128 add_estimate(const f128 &a, const f128 &b) {
    auto se = two_sum(a.hi, b.hi);
#ifdef __CUDA_ARCH__
    se.lo = __dadd_rn(se.lo, __dadd_rn(a.lo, b.lo));
#else
    se.lo += (a.lo + b.lo);
#endif

    return quick_two_sum(se.hi, se.lo);
  }

  /// Subtraction with estimate: two-diff of the hi parts, then a.lo is added
  /// to and b.lo subtracted from the error term before renormalizing.
  __host__ __device__ static f128 sub_estimate(const f128 &a, const f128 &b) {
    f128 se = two_diff(a.hi, b.hi);
#ifdef __CUDA_ARCH__
    se.lo = __dadd_rn(se.lo, a.lo);
    se.lo = __dsub_rn(se.lo, b.lo);
#else
    se.lo += a.lo;
    se.lo -= b.lo;
#endif
    return quick_two_sum(se.hi, se.lo);
  }

  /// Subtraction: two-diffs of the hi and lo parts, renormalized twice.
  __host__ __device__ static f128 sub(const f128 &a, const f128 &b) {
    auto s = two_diff(a.hi, b.hi);
    auto t = two_diff(a.lo, b.lo);
    s = quick_two_sum(s.hi, s.lo + t.hi);
    return quick_two_sum(s.hi, s.lo + t.lo);
  }

  /// Multiplication: two-product of the hi parts plus the two cross terms
  /// (a.hi * b.lo and a.lo * b.hi); the a.lo * b.lo term is not computed.
  __host__ __device__ static f128 mul(const f128 &a, const f128 &b) {
    auto p = two_prod(a.hi, b.hi);
#ifdef __CUDA_ARCH__
    double a_0_x_b_1 = __dmul_rn(a.hi, b.lo);
    double a_1_x_b_0 = __dmul_rn(a.lo, b.hi);
    p.lo = __dadd_rn(p.lo, __dadd_rn(a_0_x_b_1, a_1_x_b_0));
#else
    p.lo += (a.hi * b.lo + a.lo * b.hi);
#endif
    p = quick_two_sum(p.hi, p.lo);
    return p;
  }

  /// Complex product (c_re, c_im) = (a_re, a_im) * (b_re, b_im).
  /// All four partial products are computed before either output is written,
  /// so the outputs may alias the inputs.
  __host__ __device__ static void
  cplx_f128_mul_assign(f128 &c_re, f128 &c_im, const f128 &a_re,
                       const f128 &a_im, const f128 &b_re, const f128 &b_im) {
    auto a_re_x_b_re = mul(a_re, b_re);
    auto a_re_x_b_im = mul(a_re, b_im);
    auto a_im_x_b_re = mul(a_im, b_re);
    auto a_im_x_b_im = mul(a_im, b_im);

    c_re = sub_estimate(a_re_x_b_re, a_im_x_b_im);
    c_im = add_estimate(a_im_x_b_re, a_re_x_b_im);
  }

  /// Complex difference (c_re, c_im) = (a_re, a_im) - (b_re, b_im).
  __host__ __device__ static void
  cplx_f128_sub_assign(f128 &c_re, f128 &c_im, const f128 &a_re,
                       const f128 &a_im, const f128 &b_re, const f128 &b_im) {
    c_re = sub_estimate(a_re, b_re);
    c_im = sub_estimate(a_im, b_im);
  }

  /// Complex sum (c_re, c_im) = (a_re, a_im) + (b_re, b_im).
  __host__ __device__ static void
  cplx_f128_add_assign(f128 &c_re, f128 &c_im, const f128 &a_re,
                       const f128 &a_im, const f128 &b_re, const f128 &b_im) {
    c_re = add_estimate(a_re, b_re);
    c_im = add_estimate(a_im, b_im);
  }
};
|
||||
|
||||
/// Complex number whose real and imaginary parts are f128 values.
struct f128x2 {
  f128 re;
  f128 im;

  /// Zero-initialized complex value.
  __host__ __device__ f128x2() : re(), im() {}

  /// Builds a complex value from its two f128 components.
  __host__ __device__ f128x2(const f128 &real, const f128 &imag)
      : re(real), im(imag) {}

  /// Builds a complex value from two doubles; both lo parts are set to 0.
  __host__ __device__ f128x2(double real, double imag)
      : re(real, 0.0), im(imag, 0.0) {}

  /// Builds a purely real value from a double.
  __host__ __device__ explicit f128x2(double real)
      : re(real, 0.0), im(0.0, 0.0) {}

  // Copy/move are plain member-wise copies of four doubles, so the
  // compiler-generated versions are used. No execution-space specifiers are
  // written here: for functions defaulted on their first declaration the CUDA
  // compiler derives them, as it does for implicitly-declared functions.
  f128x2(const f128x2 &other) = default;
  f128x2(f128x2 &&other) noexcept = default;
  f128x2 &operator=(const f128x2 &other) = default;
  f128x2 &operator=(f128x2 &&other) noexcept = default;

  /// Returns (re, -im).
  __host__ __device__ f128x2 conjugate() const {
    return f128x2(re, f128(-im.hi, -im.lo));
  }

  /// Returns re * re + im * im.
  __host__ __device__ f128 norm_squared() const {
    return f128::add(f128::mul(re, re), f128::mul(im, im));
  }

  /// Resets both components to zero.
  __host__ __device__ void zero() {
    re = f128(0.0, 0.0);
    im = f128(0.0, 0.0);
  }

  // Addition
  __host__ __device__ friend f128x2 operator+(const f128x2 &a,
                                              const f128x2 &b) {
    return f128x2(f128::add(a.re, b.re), f128::add(a.im, b.im));
  }

  // Subtraction (implemented as addition of the negated operand)
  __host__ __device__ friend f128x2 operator-(const f128x2 &a,
                                              const f128x2 &b) {
    return f128x2(f128::add(a.re, f128(-b.re.hi, -b.re.lo)),
                  f128::add(a.im, f128(-b.im.hi, -b.im.lo)));
  }

  // Multiplication (complex multiplication)
  __host__ __device__ friend f128x2 operator*(const f128x2 &a,
                                              const f128x2 &b) {
    // The im * im product is needed negated; compute it once.
    const f128 im_x_im = f128::mul(a.im, b.im);
    f128 real_part =
        f128::add(f128::mul(a.re, b.re), f128(-im_x_im.hi, -im_x_im.lo));
    f128 imag_part = f128::add(f128::mul(a.re, b.im), f128::mul(a.im, b.re));
    return f128x2(real_part, imag_part);
  }

  // Addition-assignment operator
  __host__ __device__ f128x2 &operator+=(const f128x2 &other) {
    re = f128::add(re, other.re);
    im = f128::add(im, other.im);
    return *this;
  }

  // Subtraction-assignment operator
  __host__ __device__ f128x2 &operator-=(const f128x2 &other) {
    re = f128::add(re, f128(-other.re.hi, -other.re.lo));
    im = f128::add(im, f128(-other.im.hi, -other.im.lo));
    return *this;
  }

  // Multiplication-assignment operator
  __host__ __device__ f128x2 &operator*=(const f128x2 &other) {
    // Both results are computed from the old re/im before either is stored.
    const f128 im_x_im = f128::mul(im, other.im);
    f128 new_re =
        f128::add(f128::mul(re, other.re), f128(-im_x_im.hi, -im_x_im.lo));
    f128 new_im = f128::add(f128::mul(re, other.im), f128::mul(im, other.re));
    re = new_re;
    im = new_im;
    return *this;
  }
};
|
||||
|
||||
/// Returns the 64-bit object representation of `d`.
///
/// The bytes are copied with memcpy rather than read through a
/// reinterpret_cast'ed pointer, which would violate the strict-aliasing rule.
__host__ __device__ inline uint64_t double_to_bits(double d) {
  static_assert(sizeof(uint64_t) == sizeof(double),
                "double must be 64 bits wide");
  uint64_t bits;
  memcpy(&bits, &d, sizeof(bits));
  return bits;
}
|
||||
|
||||
/// Returns the double whose 64-bit object representation is `bits`.
///
/// The bytes are copied with memcpy rather than read through a
/// reinterpret_cast'ed pointer, which would violate the strict-aliasing rule.
__host__ __device__ inline double bits_to_double(uint64_t bits) {
  static_assert(sizeof(uint64_t) == sizeof(double),
                "double must be 64 bits wide");
  double d;
  memcpy(&d, &bits, sizeof(d));
  return d;
}
|
||||
|
||||
/// Converts an unsigned 128-bit integer to a double.
///
/// The value is split into a low and a high chunk; each chunk is OR-ed into
/// the mantissa bits of a power-of-two constant, the constant is subtracted
/// back out, and the two partial doubles are summed.
///
/// Declared inline because it is defined in a header: without it every
/// translation unit including this file would emit its own definition.
__host__ __device__ inline double u128_to_f64(__uint128_t x) {
  const __uint128_t ONE = 1;
  const double A = ONE << 52;
  const double B = ONE << 104;
  const double C = ONE << 76;
  // Decimal spelling of 2^128 as a double.
  const double D = 340282366920938500000000000000000000000.;

  const __uint128_t threshold = (ONE << 104);

  if (x < threshold) {
    uint64_t A_bits = double_to_bits(A);

    // Low 52 bits of x, placed in the mantissa of A (= 2^52).
    __uint128_t shifted = (x << 12);
    uint64_t lower64 = static_cast<uint64_t>(shifted);
    lower64 >>= 12;

    uint64_t bits_l = A_bits | lower64;
    double l_temp = bits_to_double(bits_l);
    double l = l_temp - A;

    // Remaining bits (x >> 52), placed in the mantissa of B (= 2^104).
    uint64_t B_bits = double_to_bits(B);
    uint64_t top64 = static_cast<uint64_t>(x >> 52);
    uint64_t bits_h = B_bits | top64;
    double h_temp = bits_to_double(bits_h);
    double h = h_temp - B;

    return (l + h);

  } else {
    uint64_t C_bits = double_to_bits(C);

    // Bits 24..75 of x, placed in the mantissa of C (= 2^76).
    __uint128_t shifted = (x >> 12);
    uint64_t lower64 = static_cast<uint64_t>(shifted);
    lower64 >>= 12;

    // The low 24 bits of x are OR-ed in as well.
    // NOTE(review): presumably acts as a sticky bit for rounding - confirm.
    uint64_t x_lo = static_cast<uint64_t>(x);
    uint64_t mask_part = (x_lo & 0xFFFFFFULL);

    uint64_t bits_l = C_bits | lower64 | mask_part;
    double l_temp = bits_to_double(bits_l);
    double l = l_temp - C;

    // Top 52 bits (x >> 76), placed in the mantissa of D.
    uint64_t D_bits = double_to_bits(D);
    uint64_t top64 = static_cast<uint64_t>(x >> 76);
    uint64_t bits_h = D_bits | top64;
    double h_temp = bits_to_double(bits_h);
    double h = h_temp - D;

    return (l + h);
  }
}
|
||||
|
||||
/// Converts a double to an unsigned 128-bit integer by shifting its mantissa
/// (with the implicit leading one restored) into place.
///
/// Returns 0 when the bit pattern of `f` is below that of 1.0, and also when
/// the computed shift amount is 128 or more (as an unsigned 64-bit value).
///
/// Declared inline because it is defined in a header: without it every
/// translation unit including this file would emit its own definition.
__host__ __device__ inline __uint128_t f64_to_u128(const double f) {
  const __uint128_t ONE = 1;
  const uint64_t f_bits = double_to_bits(f);
  // 1023 << 52 is the bit pattern of 1.0.
  if (f_bits < 1023ull << 52) {
    return 0;
  } else {
    // Implicit leading one at bit 127, mantissa right below it (the exponent
    // bits are shifted out of the 128-bit value).
    const __uint128_t m = ONE << 127 | (__uint128_t)f_bits << 75;
    // 1150 = 1023 (exponent bias) + 127 (position of the leading one).
    const uint64_t s = 1150 - (f_bits >> 52);
    if (s >= 128) {
      return 0;
    } else {
      return m >> s;
    }
  }
}
|
||||
|
||||
/// Converts a signed 128-bit integer to a double: the magnitude is converted
/// with u128_to_f64 and the sign bit of x is OR-ed into the result.
///
/// Declared inline because it is defined in a header: without it every
/// translation unit including this file would emit its own definition.
__host__ __device__ inline double i128_to_f64(__int128_t const x) {
  uint64_t sign = static_cast<uint64_t>(x >> 64) & (1ULL << 63);
  // Negate in the unsigned domain: `-x` on the signed value overflows
  // (undefined behaviour) when x is the most negative __int128_t.
  const __uint128_t unsigned_x = static_cast<__uint128_t>(x);
  __uint128_t magnitude =
      (x < 0) ? (static_cast<__uint128_t>(0) - unsigned_x) : unsigned_x;

  return bits_to_double(double_to_bits(u128_to_f64(magnitude)) | sign);
}
|
||||
/// Reinterprets x as a signed 128-bit integer and converts it to an f128.
///
/// hi is the double conversion of the value; lo is the double conversion of
/// what remains after converting hi back to an integer and subtracting it
/// from x.
///
/// Declared inline because it is defined in a header: without it every
/// translation unit including this file would emit its own definition.
__host__ __device__ inline f128 u128_to_signed_to_f128(__uint128_t x) {
  // x is implicitly converted to __int128_t by the call.
  const double first_approx = i128_to_f64(x);
  const uint64_t sign_bit = double_to_bits(first_approx) & (1ull << 63);
  const __uint128_t first_approx_roundtrip =
      f64_to_u128((first_approx < 0) ? -first_approx : first_approx);
  // Unsigned negation: wraps modulo 2^128.
  const __uint128_t first_approx_roundtrip_signed =
      (sign_bit == (1ull << 63)) ? -first_approx_roundtrip
                                 : first_approx_roundtrip;

  double correction = i128_to_f64(x - first_approx_roundtrip_signed);

  return f128(first_approx, correction);
}
|
||||
|
||||
#include <algorithm>
|
||||
#include <string>
|
||||
/// Converts a __uint128_t to its decimal string representation.
///
/// Declared inline because it is defined in a header: without it every
/// translation unit including this file would emit its own definition.
inline std::string to_string_128(__uint128_t value) {
  if (value == 0)
    return "0";

  std::string result;
  // Repeatedly divide by 10 and build the number in reverse
  while (value > 0) {
    unsigned digit = static_cast<unsigned>(value % 10);
    result.push_back(static_cast<char>('0' + digit));
    value /= 10;
  }

  // The digits are in reverse order, so reverse them
  std::reverse(result.begin(), result.end());
  return result;
}
|
||||
|
||||
#endif
|
||||
163
backends/tfhe-cuda-backend/cuda/src/fft128/fft128.cu
Normal file
163
backends/tfhe-cuda-backend/cuda/src/fft128/fft128.cu
Normal file
@@ -0,0 +1,163 @@
|
||||
#include "fft128.cuh"
|
||||
|
||||
/// Dispatches the forward f128 Fourier transform ("as integer" variant) to
/// the host_fourier_transform_forward_as_integer_f128 instantiation matching
/// the polynomial size N.
///
/// @param stream            CUDA stream, passed as an opaque pointer
/// @param gpu_index         forwarded unchanged to the host function
/// @param re0, re1          output buffers, cast to double*
/// @param im0, im1          output buffers, cast to double*
/// @param standard          input buffer, cast to const __uint128_t*
/// @param N                 polynomial size; must be a power of two in
///                          [64..4096], any other value triggers PANIC
/// @param number_of_samples forwarded unchanged to the host function
void cuda_fourier_transform_forward_as_integer_f128_async(
    void *stream, uint32_t gpu_index, void *re0, void *re1, void *im0,
    void *im1, void const *standard, const uint32_t N,
    const uint32_t number_of_samples) {
  // The casts are identical for every N: perform them once.
  auto cuda_stream = static_cast<cudaStream_t>(stream);
  auto *d_re0 = (double *)re0;
  auto *d_re1 = (double *)re1;
  auto *d_im0 = (double *)im0;
  auto *d_im1 = (double *)im1;
  auto *d_standard = (__uint128_t const *)standard;

  switch (N) {
  case 64:
    host_fourier_transform_forward_as_integer_f128<Degree<64>>(
        cuda_stream, gpu_index, d_re0, d_re1, d_im0, d_im1, d_standard, N,
        number_of_samples);
    break;
  case 128:
    host_fourier_transform_forward_as_integer_f128<Degree<128>>(
        cuda_stream, gpu_index, d_re0, d_re1, d_im0, d_im1, d_standard, N,
        number_of_samples);
    break;
  case 256:
    host_fourier_transform_forward_as_integer_f128<Degree<256>>(
        cuda_stream, gpu_index, d_re0, d_re1, d_im0, d_im1, d_standard, N,
        number_of_samples);
    break;
  case 512:
    host_fourier_transform_forward_as_integer_f128<Degree<512>>(
        cuda_stream, gpu_index, d_re0, d_re1, d_im0, d_im1, d_standard, N,
        number_of_samples);
    break;
  case 1024:
    host_fourier_transform_forward_as_integer_f128<Degree<1024>>(
        cuda_stream, gpu_index, d_re0, d_re1, d_im0, d_im1, d_standard, N,
        number_of_samples);
    break;
  case 2048:
    host_fourier_transform_forward_as_integer_f128<Degree<2048>>(
        cuda_stream, gpu_index, d_re0, d_re1, d_im0, d_im1, d_standard, N,
        number_of_samples);
    break;
  case 4096:
    host_fourier_transform_forward_as_integer_f128<Degree<4096>>(
        cuda_stream, gpu_index, d_re0, d_re1, d_im0, d_im1, d_standard, N,
        number_of_samples);
    break;
  default:
    PANIC("Cuda error (f128 fft): unsupported polynomial size. Supported "
          "N's are powers of two"
          " in the interval [64..4096].")
  }
}
|
||||
|
||||
/// Dispatches the forward f128 Fourier transform ("as torus" variant) to the
/// host_fourier_transform_forward_as_torus_f128 instantiation matching the
/// polynomial size N.
///
/// @param stream            CUDA stream, passed as an opaque pointer
/// @param gpu_index         forwarded unchanged to the host function
/// @param re0, re1          output buffers, cast to double*
/// @param im0, im1          output buffers, cast to double*
/// @param standard          input buffer, cast to const __uint128_t*
/// @param N                 polynomial size; must be a power of two in
///                          [64..4096], any other value triggers PANIC
/// @param number_of_samples forwarded unchanged to the host function
void cuda_fourier_transform_forward_as_torus_f128_async(
    void *stream, uint32_t gpu_index, void *re0, void *re1, void *im0,
    void *im1, void const *standard, const uint32_t N,
    const uint32_t number_of_samples) {
  // The casts are identical for every N: perform them once.
  auto cuda_stream = static_cast<cudaStream_t>(stream);
  auto *d_re0 = (double *)re0;
  auto *d_re1 = (double *)re1;
  auto *d_im0 = (double *)im0;
  auto *d_im1 = (double *)im1;
  auto *d_standard = (__uint128_t const *)standard;

  switch (N) {
  case 64:
    host_fourier_transform_forward_as_torus_f128<Degree<64>>(
        cuda_stream, gpu_index, d_re0, d_re1, d_im0, d_im1, d_standard, N,
        number_of_samples);
    break;
  case 128:
    host_fourier_transform_forward_as_torus_f128<Degree<128>>(
        cuda_stream, gpu_index, d_re0, d_re1, d_im0, d_im1, d_standard, N,
        number_of_samples);
    break;
  case 256:
    host_fourier_transform_forward_as_torus_f128<Degree<256>>(
        cuda_stream, gpu_index, d_re0, d_re1, d_im0, d_im1, d_standard, N,
        number_of_samples);
    break;
  case 512:
    host_fourier_transform_forward_as_torus_f128<Degree<512>>(
        cuda_stream, gpu_index, d_re0, d_re1, d_im0, d_im1, d_standard, N,
        number_of_samples);
    break;
  case 1024:
    host_fourier_transform_forward_as_torus_f128<Degree<1024>>(
        cuda_stream, gpu_index, d_re0, d_re1, d_im0, d_im1, d_standard, N,
        number_of_samples);
    break;
  case 2048:
    host_fourier_transform_forward_as_torus_f128<Degree<2048>>(
        cuda_stream, gpu_index, d_re0, d_re1, d_im0, d_im1, d_standard, N,
        number_of_samples);
    break;
  case 4096:
    host_fourier_transform_forward_as_torus_f128<Degree<4096>>(
        cuda_stream, gpu_index, d_re0, d_re1, d_im0, d_im1, d_standard, N,
        number_of_samples);
    break;
  default:
    PANIC("Cuda error (f128 fft): unsupported polynomial size. Supported "
          "N's are powers of two"
          " in the interval [64..4096].")
  }
}
|
||||
|
||||
/// Dispatches the backward f128 Fourier transform ("as torus" variant) to the
/// host_fourier_transform_backward_as_torus_f128 instantiation matching the
/// polynomial size N.
///
/// @param stream            CUDA stream, passed as an opaque pointer
/// @param gpu_index         forwarded unchanged to the host function
/// @param standard          output buffer, cast to __uint128_t*
/// @param re0, re1          input buffers, cast to const double*
/// @param im0, im1          input buffers, cast to const double*
/// @param N                 polynomial size; must be a power of two in
///                          [64..4096], any other value triggers PANIC
/// @param number_of_samples forwarded unchanged to the host function
void cuda_fourier_transform_backward_as_torus_f128_async(
    void *stream, uint32_t gpu_index, void *standard, void const *re0,
    void const *re1, void const *im0, void const *im1, const uint32_t N,
    const uint32_t number_of_samples) {
  // The casts are identical for every N: perform them once.
  auto cuda_stream = static_cast<cudaStream_t>(stream);
  auto *d_standard = (__uint128_t *)standard;
  auto *d_re0 = (double const *)re0;
  auto *d_re1 = (double const *)re1;
  auto *d_im0 = (double const *)im0;
  auto *d_im1 = (double const *)im1;

  switch (N) {
  case 64:
    host_fourier_transform_backward_as_torus_f128<Degree<64>>(
        cuda_stream, gpu_index, d_standard, d_re0, d_re1, d_im0, d_im1, N,
        number_of_samples);
    break;
  case 128:
    host_fourier_transform_backward_as_torus_f128<Degree<128>>(
        cuda_stream, gpu_index, d_standard, d_re0, d_re1, d_im0, d_im1, N,
        number_of_samples);
    break;
  case 256:
    host_fourier_transform_backward_as_torus_f128<Degree<256>>(
        cuda_stream, gpu_index, d_standard, d_re0, d_re1, d_im0, d_im1, N,
        number_of_samples);
    break;
  case 512:
    host_fourier_transform_backward_as_torus_f128<Degree<512>>(
        cuda_stream, gpu_index, d_standard, d_re0, d_re1, d_im0, d_im1, N,
        number_of_samples);
    break;
  case 1024:
    host_fourier_transform_backward_as_torus_f128<Degree<1024>>(
        cuda_stream, gpu_index, d_standard, d_re0, d_re1, d_im0, d_im1, N,
        number_of_samples);
    break;
  case 2048:
    host_fourier_transform_backward_as_torus_f128<Degree<2048>>(
        cuda_stream, gpu_index, d_standard, d_re0, d_re1, d_im0, d_im1, N,
        number_of_samples);
    break;
  case 4096:
    host_fourier_transform_backward_as_torus_f128<Degree<4096>>(
        cuda_stream, gpu_index, d_standard, d_re0, d_re1, d_im0, d_im1, N,
        number_of_samples);
    break;
  default:
    PANIC("Cuda error (f128 fft): unsupported polynomial size. Supported "
          "N's are powers of two"
          " in the interval [64..4096].")
  }
}
|
||||
760
backends/tfhe-cuda-backend/cuda/src/fft128/fft128.cuh
Normal file
760
backends/tfhe-cuda-backend/cuda/src/fft128/fft128.cuh
Normal file
@@ -0,0 +1,760 @@
|
||||
#ifndef TFHE_RS_BACKENDS_TFHE_CUDA_BACKEND_CUDA_SRC_FFT128_FFT128_CUH_
|
||||
#define TFHE_RS_BACKENDS_TFHE_CUDA_BACKEND_CUDA_SRC_FFT128_FFT128_CUH_
|
||||
|
||||
#include "f128.cuh"
|
||||
#include "pbs/fft.h"
|
||||
#include "polynomial/functions.cuh"
|
||||
#include "polynomial/parameters.cuh"
|
||||
#include "twiddles.cuh"
|
||||
#include "types/complex/operations.cuh"
|
||||
#include <iostream>
|
||||
|
||||
using Index = unsigned;
|
||||
|
||||
// Builds the i-th twiddle factor as an f128x2 from the four
// neg_twiddles_{re,im}_{hi,lo} tables.
#define NEG_TWID(i)                                                            \
  f128x2(f128(neg_twiddles_re_hi[(i)], neg_twiddles_re_lo[(i)]),               \
         f128(neg_twiddles_im_hi[(i)], neg_twiddles_im_lo[(i)]))

// Loads element `ind` of the four arrays dt_re_hi / dt_re_lo / dt_im_hi /
// dt_im_lo (which must be in scope at the point of use) into `f128x2_reg`.
// Wrapped in do { } while (0) so that it behaves as a single statement;
// note that `ind` is evaluated four times.
#define F64x4_TO_F128x2(f128x2_reg, ind)                                       \
  do {                                                                         \
    (f128x2_reg).re.hi = dt_re_hi[(ind)];                                      \
    (f128x2_reg).re.lo = dt_re_lo[(ind)];                                      \
    (f128x2_reg).im.hi = dt_im_hi[(ind)];                                      \
    (f128x2_reg).im.lo = dt_im_lo[(ind)];                                      \
  } while (0)

// Stores `f128x2_reg` into element `ind` of the four arrays dt_re_hi /
// dt_re_lo / dt_im_hi / dt_im_lo (which must be in scope at the point of
// use). Wrapped in do { } while (0) so that it behaves as a single statement;
// note that `ind` is evaluated four times.
#define F128x2_TO_F64x4(f128x2_reg, ind)                                       \
  do {                                                                         \
    dt_re_hi[(ind)] = (f128x2_reg).re.hi;                                      \
    dt_re_lo[(ind)] = (f128x2_reg).re.lo;                                      \
    dt_im_hi[(ind)] = (f128x2_reg).im.hi;                                      \
    dt_im_lo[(ind)] = (f128x2_reg).im.lo;                                      \
  } while (0)
|
||||
|
||||
// zl - left part of butterfly operation
|
||||
// zr - right part of butterfly operation
|
||||
// re - real part
|
||||
// im - imaginary part
|
||||
// hi - high bits
|
||||
// lo - low bits
|
||||
// dt - list
|
||||
// cf - single coefficient
|
||||
/// In-place forward negacyclic FFT on complex f128 data stored as four
/// separate double arrays (re/im x hi/lo).
///
/// Each thread handles params::opt / 2 butterflies per level, keeping its
/// operands in the local arrays u and v and exchanging them through the dt_*
/// arrays between levels. On exit, butterfly outputs u and v of logical index
/// t are stored at positions 2 * t and 2 * t + 1.
///
/// NOTE(review): the dt_* arrays are presumably shared memory (the exchange
/// relies on __syncthreads()) - confirm against the callers.
///
/// @param dt_re_hi high parts of the real components
/// @param dt_re_lo low parts of the real components
/// @param dt_im_hi high parts of the imaginary components
/// @param dt_im_lo low parts of the imaginary components
template <class params>
__device__ void negacyclic_forward_fft_f128(double *dt_re_hi, double *dt_re_lo,
                                            double *dt_im_hi,
                                            double *dt_im_lo) {

  __syncthreads();
  constexpr Index BUTTERFLY_DEPTH = params::opt >> 1;
  constexpr Index LOG2_DEGREE = params::log2_degree;
  constexpr Index HALF_DEGREE = params::degree >> 1;
  constexpr Index STRIDE = params::degree / params::opt;

  f128x2 u[BUTTERFLY_DEPTH], v[BUTTERFLY_DEPTH], w;

  Index tid = threadIdx.x;

  // load into registers
#pragma unroll
  for (Index i = 0; i < BUTTERFLY_DEPTH; ++i) {
    F64x4_TO_F128x2(u[i], tid);
    F64x4_TO_F128x2(v[i], tid + HALF_DEGREE);
    tid += STRIDE;
  }

  // level 1: every butterfly uses the same twiddle, NEG_TWID(1)
  const f128x2 level1_twiddle = NEG_TWID(1);
#pragma unroll
  for (Index i = 0; i < BUTTERFLY_DEPTH; ++i) {
    // w = v[i] * twiddle
    f128::cplx_f128_mul_assign(w.re, w.im, v[i].re, v[i].im, level1_twiddle.re,
                               level1_twiddle.im);
    // v[i] = u[i] - w, then u[i] = u[i] + w (v must be updated first, since
    // it reads the old u[i])
    f128::cplx_f128_sub_assign(v[i].re, v[i].im, u[i].re, u[i].im, w.re, w.im);
    f128::cplx_f128_add_assign(u[i].re, u[i].im, u[i].re, u[i].im, w.re, w.im);
  }

  Index twiddle_shift = 1;
  for (Index l = LOG2_DEGREE - 1; l >= 1; --l) {
    Index lane_mask = 1 << (l - 1);
    Index thread_mask = (1 << l) - 1;
    twiddle_shift <<= 1;

    // publish the operand that has to move to the partner position
    tid = threadIdx.x;
    __syncthreads();
#pragma unroll
    for (Index i = 0; i < BUTTERFLY_DEPTH; i++) {
      Index rank = tid & thread_mask;
      bool u_stays_in_register = rank < lane_mask;
      F128x2_TO_F64x4(((u_stays_in_register) ? v[i] : u[i]), tid);
      tid = tid + STRIDE;
    }
    __syncthreads();

    // fetch the partner's operand and run the butterfly
    tid = threadIdx.x;
#pragma unroll
    for (Index i = 0; i < BUTTERFLY_DEPTH; i++) {
      Index rank = tid & thread_mask;
      bool u_stays_in_register = rank < lane_mask;
      F64x4_TO_F128x2(w, tid ^ lane_mask);
      u[i] = (u_stays_in_register) ? u[i] : w;
      v[i] = (u_stays_in_register) ? w : v[i];
      w = NEG_TWID(tid / lane_mask + twiddle_shift);

      // w = v[i] * w; the callee computes all partial products before
      // writing its outputs, so passing w as both input and output is fine
      f128::cplx_f128_mul_assign(w.re, w.im, v[i].re, v[i].im, w.re, w.im);
      f128::cplx_f128_sub_assign(v[i].re, v[i].im, u[i].re, u[i].im, w.re,
                                 w.im);
      f128::cplx_f128_add_assign(u[i].re, u[i].im, u[i].re, u[i].im, w.re,
                                 w.im);
      tid = tid + STRIDE;
    }
  }
  __syncthreads();

  // store registers back, interleaving u and v
  tid = threadIdx.x;
#pragma unroll
  for (Index i = 0; i < BUTTERFLY_DEPTH; i++) {
    F128x2_TO_F64x4(u[i], tid * 2);
    F128x2_TO_F64x4(v[i], (tid * 2 + 1));
    tid = tid + STRIDE;
  }
  __syncthreads();
}
|
||||
|
||||
// Inverse negacyclic FFT on double-double ("f128") complex values that are
// stored as four parallel arrays of doubles (re.hi, re.lo, im.hi, im.lo).
//
// Each thread holds BUTTERFLY_DEPTH (u, v) pairs in local f128x2 variables,
// applies LOG2_DEGREE butterfly stages to them and, between stages, swaps one
// element of every pair with another thread through the dt_* arrays.
// On return the dt_* arrays hold the result: u[i] at index tid and v[i] at
// index tid + HALF_DEGREE, with tid advancing by STRIDE for each i.
//
// Must be reached by every thread of the block: it contains __syncthreads().
// NOTE(review): no scaling by the transform size happens in this function;
// confirm that the caller normalises the output.
template <class params>
__device__ void negacyclic_inverse_fft_f128(double *dt_re_hi, double *dt_re_lo,
                                            double *dt_im_hi,
                                            double *dt_im_lo) {
  __syncthreads();
  constexpr Index BUTTERFLY_DEPTH = params::opt >> 1;
  constexpr Index LOG2_DEGREE = params::log2_degree;
  constexpr Index DEGREE = params::degree;
  constexpr Index HALF_DEGREE = params::degree >> 1;
  constexpr Index STRIDE = params::degree / params::opt;

  size_t tid = threadIdx.x;
  f128x2 u[BUTTERFLY_DEPTH], v[BUTTERFLY_DEPTH], w;

  // load every (u, v) pair from two adjacent slots of the dt_* arrays
#pragma unroll
  for (Index i = 0; i < BUTTERFLY_DEPTH; ++i) {
    F64x4_TO_F128x2(u[i], 2 * tid);
    F64x4_TO_F128x2(v[i], 2 * tid + 1);

    tid += STRIDE;
  }

  Index twiddle_shift = DEGREE;
  for (Index l = 1; l <= LOG2_DEGREE - 1; ++l) {
    Index lane_mask = 1 << (l - 1);
    Index thread_mask = (1 << l) - 1;
    twiddle_shift >>= 1;

    // at this point registers are ready for the butterfly
    tid = threadIdx.x;
    __syncthreads();
#pragma unroll
    for (Index i = 0; i < BUTTERFLY_DEPTH; ++i) {
      w = (u[i] - v[i]);
      u[i] += v[i];
      v[i] = w * NEG_TWID(tid / lane_mask + twiddle_shift).conjugate();

      // keep one of the register for next iteration and store another one in
      // sm
      Index rank = tid & thread_mask;
      bool u_stays_in_register = rank < lane_mask;
      // The selected value is wrapped in parentheses, exactly as the forward
      // transform does, so the macro's member accesses apply to the result of
      // the conditional and not only to its last operand.
      F128x2_TO_F64x4(((u_stays_in_register) ? v[i] : u[i]), tid);

      tid = tid + STRIDE;
    }
    __syncthreads();

    // prepare registers for next butterfly iteration: fetch the value the
    // partner thread (index tid ^ lane_mask) stored above
    tid = threadIdx.x;
#pragma unroll
    for (Index i = 0; i < BUTTERFLY_DEPTH; ++i) {
      Index rank = tid & thread_mask;
      bool u_stays_in_register = rank < lane_mask;
      F64x4_TO_F128x2(w, tid ^ lane_mask);

      u[i] = (u_stays_in_register) ? u[i] : w;
      v[i] = (u_stays_in_register) ? w : v[i];

      tid = tid + STRIDE;
    }
  }

  // last iteration: every pair uses the same twiddle, NEG_TWID(1)
  for (Index i = 0; i < BUTTERFLY_DEPTH; ++i) {
    w = (u[i] - v[i]);
    u[i] = u[i] + v[i];
    v[i] = w * NEG_TWID(1).conjugate();
  }
  __syncthreads();

  // store registers in SM
  tid = threadIdx.x;
#pragma unroll
  for (Index i = 0; i < BUTTERFLY_DEPTH; i++) {
    F128x2_TO_F64x4(u[i], tid);
    F128x2_TO_F64x4(v[i], tid + HALF_DEGREE);

    tid = tid + STRIDE;
  }
  __syncthreads();
}
|
||||
|
||||
// Converts complex coefficients given as 128-bit integers into double-double
// values, written as four separate arrays (re.hi, re.lo, im.hi, im.lo).
// Each input word goes through u128_to_signed_to_f128() and is stored
// unscaled.
//
// Each thread handles params::opt / 2 indices, starting at threadIdx.x and
// advancing by params::degree / params::opt. A thread only reads and writes
// its own indices, so no block-level barrier is required here.
//
// params is expected to be full degree not half degree
template <class params>
__device__ void convert_u128_to_f128_as_integer(
    double *out_re_hi, double *out_re_lo, double *out_im_hi, double *out_im_lo,
    const __uint128_t *in_re, const __uint128_t *in_im) {

  Index tid = threadIdx.x;
  for (Index i = 0; i < params::opt / 2; i++) {
    auto out_re = u128_to_signed_to_f128(in_re[tid]);
    auto out_im = u128_to_signed_to_f128(in_im[tid]);

    out_re_hi[tid] = out_re.hi;
    out_re_lo[tid] = out_re.lo;
    out_im_hi[tid] = out_im.hi;
    out_im_lo[tid] = out_im.lo;

    tid += params::degree / params::opt;
  }
}
|
||||
|
||||
// Converts complex coefficients given as 128-bit integers into double-double
// values scaled by 2^-128, written as four separate arrays
// (re.hi, re.lo, im.hi, im.lo). Each input word goes through
// u128_to_signed_to_f128() first; both halves of the result are then
// multiplied by the same power of two.
//
// Each thread handles params::opt / 2 indices, starting at threadIdx.x and
// advancing by params::degree / params::opt. A thread only reads and writes
// its own indices, so no block-level barrier is required here.
//
// params is expected to be full degree not half degree
template <class params>
__device__ void convert_u128_to_f128_as_torus(
    double *out_re_hi, double *out_re_lo, double *out_im_hi, double *out_im_lo,
    const __uint128_t *in_re, const __uint128_t *in_im) {

  const double normalization = pow(2., -128.);
  Index tid = threadIdx.x;
  for (Index i = 0; i < params::opt / 2; i++) {
    auto out_re = u128_to_signed_to_f128(in_re[tid]);
    auto out_im = u128_to_signed_to_f128(in_im[tid]);

    out_re_hi[tid] = out_re.hi * normalization;
    out_re_lo[tid] = out_re.lo * normalization;
    out_im_hi[tid] = out_im.hi * normalization;
    out_im_lo[tid] = out_im.lo * normalization;

    tid += params::degree / params::opt;
  }
}
|
||||
|
||||
// Kernel wrapper around convert_u128_to_f128_as_integer: block b converts
// polynomial b of the batch. The first params::degree / 2 input words of a
// polynomial are passed as the real parts, the following params::degree / 2
// as the imaginary parts.
//
// params is expected to be full degree not half degree
template <class params>
__global__ void
batch_convert_u128_to_f128_as_integer(double *out_re_hi, double *out_re_lo,
                                      double *out_im_hi, double *out_im_lo,
                                      const __uint128_t *in) {

  // offset of this block's slice in the output arrays and in the input
  const auto dst_offset = blockIdx.x * params::degree / 2;
  const auto src_offset = blockIdx.x * params::degree;

  convert_u128_to_f128_as_integer<params>(
      out_re_hi + dst_offset, out_re_lo + dst_offset, out_im_hi + dst_offset,
      out_im_lo + dst_offset, in + src_offset,
      in + (src_offset + params::degree / 2));
}
|
||||
|
||||
// Kernel wrapper around convert_u128_to_f128_as_torus: block b converts
// polynomial b of the batch. The first params::degree / 2 input words of a
// polynomial are passed as the real parts, the following params::degree / 2
// as the imaginary parts.
//
// params is expected to be full degree not half degree
template <class params>
__global__ void
batch_convert_u128_to_f128_as_torus(double *out_re_hi, double *out_re_lo,
                                    double *out_im_hi, double *out_im_lo,
                                    const __uint128_t *in) {

  // offset of this block's slice in the output arrays and in the input
  const auto dst_offset = blockIdx.x * params::degree / 2;
  const auto src_offset = blockIdx.x * params::degree;

  convert_u128_to_f128_as_torus<params>(
      out_re_hi + dst_offset, out_re_lo + dst_offset, out_im_hi + dst_offset,
      out_im_lo + dst_offset, in + src_offset,
      in + (src_offset + params::degree / 2));
}
|
||||
|
||||
// Batched forward negacyclic FFT on double-double complex data.
//
// Block b processes polynomial b: it copies params::degree / 2 values from
// each in_* array (at offset b * params::degree / 2) into a working area,
// runs negacyclic_forward_fft_f128 on it and copies the result to the same
// offset of the out_* arrays.
//
// The working area holds four consecutive arrays of params::degree / 2
// doubles (re.hi, re.lo, im.hi, im.lo). With SMD == NOSM it is this block's
// slice of `buffer`; otherwise it is the dynamic shared memory of the launch,
// which must then be at least 4 * (params::degree / 2) * sizeof(double) bytes.
template <class params, sharedMemDegree SMD>
__global__ void
batch_NSMFFT_128(double *in_re_hi, double *in_re_lo, double *in_im_hi,
                 double *in_im_lo, double *out_re_hi, double *out_re_lo,
                 double *out_im_hi, double *out_im_lo, double *buffer) {
  extern __shared__ double sharedMemoryFFT[];
  double *re_hi, *re_lo, *im_hi, *im_lo;

  if (SMD == NOSM) {
    re_hi =
        &buffer[blockIdx.x * params::degree / 2 * 4 + params::degree / 2 * 0];
    re_lo =
        &buffer[blockIdx.x * params::degree / 2 * 4 + params::degree / 2 * 1];
    im_hi =
        &buffer[blockIdx.x * params::degree / 2 * 4 + params::degree / 2 * 2];
    im_lo =
        &buffer[blockIdx.x * params::degree / 2 * 4 + params::degree / 2 * 3];
  } else {
    re_hi = &sharedMemoryFFT[params::degree / 2 * 0];
    re_lo = &sharedMemoryFFT[params::degree / 2 * 1];
    im_hi = &sharedMemoryFFT[params::degree / 2 * 2];
    im_lo = &sharedMemoryFFT[params::degree / 2 * 3];
  }

  // stage this block's polynomial into the working area
  Index tid = threadIdx.x;
#pragma unroll
  for (Index i = 0; i < params::opt / 2; ++i) {
    re_hi[tid] = in_re_hi[blockIdx.x * (params::degree / 2) + tid];
    re_lo[tid] = in_re_lo[blockIdx.x * (params::degree / 2) + tid];
    im_hi[tid] = in_im_hi[blockIdx.x * (params::degree / 2) + tid];
    im_lo[tid] = in_im_lo[blockIdx.x * (params::degree / 2) + tid];
    tid += params::degree / params::opt;
  }
  // the FFT reads values staged by other threads
  __syncthreads();
  negacyclic_forward_fft_f128<HalfDegree<params>>(re_hi, re_lo, im_hi, im_lo);
  __syncthreads();

  // write the transformed polynomial back
  tid = threadIdx.x;
#pragma unroll
  for (Index i = 0; i < params::opt / 2; ++i) {
    out_re_hi[blockIdx.x * (params::degree / 2) + tid] = re_hi[tid];
    out_re_lo[blockIdx.x * (params::degree / 2) + tid] = re_lo[tid];
    out_im_hi[blockIdx.x * (params::degree / 2) + tid] = im_hi[tid];
    out_im_lo[blockIdx.x * (params::degree / 2) + tid] = im_lo[tid];
    tid += params::degree / params::opt;
  }
}
|
||||
|
||||
// Debug helper: prints the 128 bits of `value` to stdout as a string of '0'
// and '1' characters, most significant bit first, followed by a newline.
//
// Declared `static inline` because the definition lives in a header: this
// avoids multiple-definition link errors when the header is included from
// more than one translation unit.
static inline void print_uint128_bits(__uint128_t value) {
  char buffer[129];   // 128 bits + null terminator
  buffer[128] = '\0'; // Null-terminate the string

  // Fill from the end so that the least significant bit lands in the last
  // character.
  for (int i = 127; i >= 0; --i) {
    buffer[i] = (value & 1) ? '1' : '0'; // Extract the least significant bit
    value >>= 1;                         // Shift right by 1 bit
  }

  printf("%s\n", buffer);
}
|
||||
|
||||
// Host driver for the forward negacyclic FFT of 128-bit coefficients that are
// interpreted as integers.
//
// Uploads `standard` (N coefficients), converts it on the device into four
// arrays of N / 2 doubles with batch_convert_u128_to_f128_as_integer, runs
// batch_NSMFFT_128 in place on those arrays and downloads them into re0, re1,
// im0 and im1 (N / 2 doubles each). The function synchronises the device
// before returning, so the host arrays are ready on return.
//
// NOTE(review): the device arrays and all copies are sized for one polynomial
// while both kernels are launched with `number_of_samples` blocks, each of
// which addresses its own slice; confirm callers only use
// number_of_samples == 1 or scale these sizes accordingly.
template <class params>
__host__ void host_fourier_transform_forward_as_integer_f128(
    cudaStream_t stream, uint32_t gpu_index, double *re0, double *re1,
    double *im0, double *im1, const __uint128_t *standard, const uint32_t N,
    const uint32_t number_of_samples) {

  const size_t component_bytes = N / 2 * sizeof(double);
  const size_t standard_bytes = N * sizeof(__uint128_t);

  // allocate device buffers
  double *d_re0 =
      (double *)cuda_malloc_async(component_bytes, stream, gpu_index);
  double *d_re1 =
      (double *)cuda_malloc_async(component_bytes, stream, gpu_index);
  double *d_im0 =
      (double *)cuda_malloc_async(component_bytes, stream, gpu_index);
  double *d_im1 =
      (double *)cuda_malloc_async(component_bytes, stream, gpu_index);
  __uint128_t *d_standard =
      (__uint128_t *)cuda_malloc_async(standard_bytes, stream, gpu_index);

  // copy input into device
  cuda_memcpy_async_to_gpu(d_standard, standard, standard_bytes, stream,
                           gpu_index);

  // setup launch parameters
  size_t required_shared_memory_size = sizeof(double) * N / 2 * 4;
  int grid_size = number_of_samples;
  int block_size = params::degree / params::opt;
  bool full_sm =
      (required_shared_memory_size <= cuda_get_max_shared_memory(gpu_index));
  // Scratch area used by the kernel in place of shared memory. The kernel
  // indexes it as doubles (4 * N / 2 per block), so the allocation size has
  // to be expressed in bytes.
  size_t buffer_size =
      full_sm ? 0 : (size_t)number_of_samples * N / 2 * 4 * sizeof(double);
  size_t shared_memory_size = full_sm ? required_shared_memory_size : 0;
  double *buffer = (double *)cuda_malloc_async(buffer_size, stream, gpu_index);

  // configure shared memory for batch fft kernel
  if (full_sm) {
    check_cuda_error(cudaFuncSetAttribute(
        batch_NSMFFT_128<FFTDegree<params, ForwardFFT>, FULLSM>,
        cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
    check_cuda_error(cudaFuncSetCacheConfig(
        batch_NSMFFT_128<FFTDegree<params, ForwardFFT>, FULLSM>,
        cudaFuncCachePreferShared));
  }

  // convert u128 into 4 x double
  batch_convert_u128_to_f128_as_integer<params>
      <<<grid_size, block_size, 0, stream>>>(d_re0, d_re1, d_im0, d_im1,
                                             d_standard);
  check_cuda_error(cudaGetLastError());

  // call negacyclic 128 bit forward fft.
  if (full_sm) {
    batch_NSMFFT_128<FFTDegree<params, ForwardFFT>, FULLSM>
        <<<grid_size, block_size, shared_memory_size, stream>>>(
            d_re0, d_re1, d_im0, d_im1, d_re0, d_re1, d_im0, d_im1, buffer);
  } else {
    batch_NSMFFT_128<FFTDegree<params, ForwardFFT>, NOSM>
        <<<grid_size, block_size, shared_memory_size, stream>>>(
            d_re0, d_re1, d_im0, d_im1, d_re0, d_re1, d_im0, d_im1, buffer);
  }
  check_cuda_error(cudaGetLastError());

  check_cuda_error(cudaDeviceSynchronize());

  // copy the four result arrays back to the host
  cuda_memcpy_async_to_cpu(re0, d_re0, component_bytes, stream, gpu_index);
  cuda_memcpy_async_to_cpu(re1, d_re1, component_bytes, stream, gpu_index);
  cuda_memcpy_async_to_cpu(im0, d_im0, component_bytes, stream, gpu_index);
  cuda_memcpy_async_to_cpu(im1, d_im1, component_bytes, stream, gpu_index);

  // release every device allocation made above, including the scratch buffer
  cuda_drop_async(d_standard, stream, gpu_index);
  cuda_drop_async(d_re0, stream, gpu_index);
  cuda_drop_async(d_re1, stream, gpu_index);
  cuda_drop_async(d_im0, stream, gpu_index);
  cuda_drop_async(d_im1, stream, gpu_index);
  cuda_drop_async(buffer, stream, gpu_index);

  // the copies above are asynchronous: wait for them before returning
  check_cuda_error(cudaDeviceSynchronize());
}
|
||||
|
||||
// Debug kernel: prints the first N / 2 entries of the four neg_twiddles_*
// tables, one entry per line. Every thread that runs it prints the full list.
__global__ void print_twiddles(int N) {
  const int entry_count = N / 2;
  int k = 0;
  while (k < entry_count) {
    printf("%.73f %.73f %.73f %.73f\n", neg_twiddles_re_hi[k],
           neg_twiddles_re_lo[k], neg_twiddles_im_hi[k], neg_twiddles_im_lo[k]);
    ++k;
  }
}
|
||||
|
||||
// Debug kernel: prints the first N / 2 entries of four parallel double arrays
// (re0, re1, im0, im1), one entry per line. Every thread that runs it prints
// the full list.
__global__ void print_c128(double *re0, double *re1, double *im0, double *im1,
                           int N) {
  const int entry_count = N / 2;
  int k = 0;
  while (k < entry_count) {
    printf("%.73f %.73f %.73f %.73f\n", re0[k], re1[k], im0[k], im1[k]);
    ++k;
  }
}
|
||||
|
||||
// Host driver for the forward negacyclic FFT of 128-bit coefficients that are
// interpreted as torus elements.
//
// Uploads `standard` (N coefficients), converts it on the device into four
// arrays of N / 2 doubles with batch_convert_u128_to_f128_as_torus, runs
// batch_NSMFFT_128 in place on those arrays and downloads them into re0, re1,
// im0 and im1 (N / 2 doubles each). The function synchronises the device
// before returning, so the host arrays are ready on return.
//
// Unlike earlier revisions it no longer launches the print_twiddles and
// print_c128 debug kernels; this matches the integer variant of this driver.
//
// NOTE(review): the device arrays and all copies are sized for one polynomial
// while both kernels are launched with `number_of_samples` blocks, each of
// which addresses its own slice; confirm callers only use
// number_of_samples == 1 or scale these sizes accordingly.
template <class params>
__host__ void host_fourier_transform_forward_as_torus_f128(
    cudaStream_t stream, uint32_t gpu_index, double *re0, double *re1,
    double *im0, double *im1, const __uint128_t *standard, const uint32_t N,
    const uint32_t number_of_samples) {

  const size_t component_bytes = N / 2 * sizeof(double);
  const size_t standard_bytes = N * sizeof(__uint128_t);

  // allocate device buffers
  double *d_re0 =
      (double *)cuda_malloc_async(component_bytes, stream, gpu_index);
  double *d_re1 =
      (double *)cuda_malloc_async(component_bytes, stream, gpu_index);
  double *d_im0 =
      (double *)cuda_malloc_async(component_bytes, stream, gpu_index);
  double *d_im1 =
      (double *)cuda_malloc_async(component_bytes, stream, gpu_index);
  __uint128_t *d_standard =
      (__uint128_t *)cuda_malloc_async(standard_bytes, stream, gpu_index);

  // copy input into device
  cuda_memcpy_async_to_gpu(d_standard, standard, standard_bytes, stream,
                           gpu_index);

  // setup launch parameters
  size_t required_shared_memory_size = sizeof(double) * N / 2 * 4;
  int grid_size = number_of_samples;
  int block_size = params::degree / params::opt;
  bool full_sm =
      (required_shared_memory_size <= cuda_get_max_shared_memory(gpu_index));
  // Scratch area used by the kernel in place of shared memory. The kernel
  // indexes it as doubles (4 * N / 2 per block), so the allocation size has
  // to be expressed in bytes.
  size_t buffer_size =
      full_sm ? 0 : (size_t)number_of_samples * N / 2 * 4 * sizeof(double);
  size_t shared_memory_size = full_sm ? required_shared_memory_size : 0;
  double *buffer = (double *)cuda_malloc_async(buffer_size, stream, gpu_index);

  // configure shared memory for batch fft kernel
  if (full_sm) {
    check_cuda_error(cudaFuncSetAttribute(
        batch_NSMFFT_128<FFTDegree<params, ForwardFFT>, FULLSM>,
        cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
    check_cuda_error(cudaFuncSetCacheConfig(
        batch_NSMFFT_128<FFTDegree<params, ForwardFFT>, FULLSM>,
        cudaFuncCachePreferShared));
  }

  // convert u128 into 4 x double
  batch_convert_u128_to_f128_as_torus<params>
      <<<grid_size, block_size, 0, stream>>>(d_re0, d_re1, d_im0, d_im1,
                                             d_standard);
  check_cuda_error(cudaGetLastError());

  // call negacyclic 128 bit forward fft.
  if (full_sm) {
    batch_NSMFFT_128<FFTDegree<params, ForwardFFT>, FULLSM>
        <<<grid_size, block_size, shared_memory_size, stream>>>(
            d_re0, d_re1, d_im0, d_im1, d_re0, d_re1, d_im0, d_im1, buffer);
  } else {
    batch_NSMFFT_128<FFTDegree<params, ForwardFFT>, NOSM>
        <<<grid_size, block_size, shared_memory_size, stream>>>(
            d_re0, d_re1, d_im0, d_im1, d_re0, d_re1, d_im0, d_im1, buffer);
  }
  check_cuda_error(cudaGetLastError());

  check_cuda_error(cudaDeviceSynchronize());

  // copy the four result arrays back to the host
  cuda_memcpy_async_to_cpu(re0, d_re0, component_bytes, stream, gpu_index);
  cuda_memcpy_async_to_cpu(re1, d_re1, component_bytes, stream, gpu_index);
  cuda_memcpy_async_to_cpu(im0, d_im0, component_bytes, stream, gpu_index);
  cuda_memcpy_async_to_cpu(im1, d_im1, component_bytes, stream, gpu_index);

  // release every device allocation made above, including the scratch buffer
  cuda_drop_async(d_standard, stream, gpu_index);
  cuda_drop_async(d_re0, stream, gpu_index);
  cuda_drop_async(d_re1, stream, gpu_index);
  cuda_drop_async(d_im0, stream, gpu_index);
  cuda_drop_async(d_im1, stream, gpu_index);
  cuda_drop_async(buffer, stream, gpu_index);

  // the copies above are asynchronous: wait for them before returning
  check_cuda_error(cudaDeviceSynchronize());
}
|
||||
|
||||
// Host driver for the backward (Fourier -> standard) transform of
// double-double complex data into 128-bit torus coefficients.
//
// Uploads the four Fourier-domain arrays re0, re1, im0 and im1 (N / 2 doubles
// each) to the device and downloads N 128-bit coefficients into `standard`.
// The function synchronises the device before returning.
//
// NOTE(review): the inverse-transform launch is not implemented yet, so
// nothing writes d_standard before it is copied back and `standard` receives
// unspecified data. The launch configuration below is kept ready for that
// kernel.
// NOTE(review): all sizes are for a single polynomial although
// `number_of_samples` is used as the grid size; confirm the intended batch
// contract when the kernel launch is added.
template <class params>
__host__ void host_fourier_transform_backward_as_torus_f128(
    cudaStream_t stream, uint32_t gpu_index, __uint128_t *standard,
    double const *re0, double const *re1, double const *im0, double const *im1,
    const uint32_t N, const uint32_t number_of_samples) {

  const size_t component_bytes = N / 2 * sizeof(double);
  const size_t standard_bytes = N * sizeof(__uint128_t);

  // allocate device buffers
  double *d_re0 =
      (double *)cuda_malloc_async(component_bytes, stream, gpu_index);
  double *d_re1 =
      (double *)cuda_malloc_async(component_bytes, stream, gpu_index);
  double *d_im0 =
      (double *)cuda_malloc_async(component_bytes, stream, gpu_index);
  double *d_im1 =
      (double *)cuda_malloc_async(component_bytes, stream, gpu_index);
  __uint128_t *d_standard =
      (__uint128_t *)cuda_malloc_async(standard_bytes, stream, gpu_index);

  // copy input into device: each device array is filled from its matching
  // host array
  cuda_memcpy_async_to_gpu(d_re0, re0, component_bytes, stream, gpu_index);
  cuda_memcpy_async_to_gpu(d_re1, re1, component_bytes, stream, gpu_index);
  cuda_memcpy_async_to_gpu(d_im0, im0, component_bytes, stream, gpu_index);
  cuda_memcpy_async_to_gpu(d_im1, im1, component_bytes, stream, gpu_index);

  // setup launch parameters
  size_t required_shared_memory_size = sizeof(double) * N / 2 * 4;
  int grid_size = number_of_samples;
  int block_size = params::degree / params::opt;
  bool full_sm =
      (required_shared_memory_size <= cuda_get_max_shared_memory(gpu_index));
  // Scratch area for a kernel that cannot use shared memory; it is indexed as
  // doubles, so the allocation size has to be expressed in bytes.
  size_t buffer_size =
      full_sm ? 0 : (size_t)number_of_samples * N / 2 * 4 * sizeof(double);
  size_t shared_memory_size = full_sm ? required_shared_memory_size : 0;
  double *buffer = (double *)cuda_malloc_async(buffer_size, stream, gpu_index);

  // configure shared memory for batch fft kernel
  // NOTE(review): this configures the forward kernel instantiation; revisit
  // once the inverse kernel exists.
  if (full_sm) {
    check_cuda_error(cudaFuncSetAttribute(
        batch_NSMFFT_128<FFTDegree<params, ForwardFFT>, FULLSM>,
        cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
    check_cuda_error(cudaFuncSetCacheConfig(
        batch_NSMFFT_128<FFTDegree<params, ForwardFFT>, FULLSM>,
        cudaFuncCachePreferShared));
  }

  // TODO: launch the inverse negacyclic FFT and the f128 -> u128 conversion
  // here, using grid_size, block_size, shared_memory_size and buffer.
  (void)grid_size;
  (void)block_size;

  cuda_memcpy_async_to_cpu(standard, d_standard, standard_bytes, stream,
                           gpu_index);

  // release every device allocation made above, including the scratch buffer
  cuda_drop_async(d_standard, stream, gpu_index);
  cuda_drop_async(d_re0, stream, gpu_index);
  cuda_drop_async(d_re1, stream, gpu_index);
  cuda_drop_async(d_im0, stream, gpu_index);
  cuda_drop_async(d_im1, stream, gpu_index);
  cuda_drop_async(buffer, stream, gpu_index);

  // the copy above is asynchronous: wait for it before returning
  check_cuda_error(cudaDeviceSynchronize());
}
|
||||
|
||||
#endif // TFHE_RS_BACKENDS_TFHE_CUDA_BACKEND_CUDA_SRC_FFT128_FFT128_CUH_
|
||||
16387
backends/tfhe-cuda-backend/cuda/src/fft128/twiddles.cu
Normal file
16387
backends/tfhe-cuda-backend/cuda/src/fft128/twiddles.cu
Normal file
File diff suppressed because it is too large
Load Diff
11
backends/tfhe-cuda-backend/cuda/src/fft128/twiddles.cuh
Normal file
11
backends/tfhe-cuda-backend/cuda/src/fft128/twiddles.cuh
Normal file
@@ -0,0 +1,11 @@
|
||||
#ifndef GPU_BOOTSTRAP_128_TWIDDLES_CUH
|
||||
#define GPU_BOOTSTRAP_128_TWIDDLES_CUH
|
||||
|
||||
/*
|
||||
* The 'neg_twiddles' tables are stored in device memory to benefit from caching
|
||||
*/
|
||||
extern __device__ double neg_twiddles_re_hi[4096];
|
||||
extern __device__ double neg_twiddles_re_lo[4096];
|
||||
extern __device__ double neg_twiddles_im_hi[4096];
|
||||
extern __device__ double neg_twiddles_im_lo[4096];
|
||||
#endif
|
||||
43
backends/tfhe-cuda-backend/cuda/src/integer/abs.cu
Normal file
43
backends/tfhe-cuda-backend/cuda/src/integer/abs.cu
Normal file
@@ -0,0 +1,43 @@
|
||||
#include "integer/abs.cuh"
|
||||
|
||||
// C entry point that prepares the scratch buffer for the in-place absolute
// value on 64-bit radix ciphertexts. It packs the scalar parameters into an
// int_radix_params and forwards to scratch_cuda_integer_abs_kb<uint64_t>,
// which stores the buffer through `mem_ptr`.
void scratch_cuda_integer_abs_inplace_radix_ciphertext_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr, bool is_signed, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t big_lwe_dimension,
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
    PBS_TYPE pbs_type, bool allocate_gpu_memory) {

  int_radix_params radix_params(
      pbs_type, glwe_dimension, polynomial_size, big_lwe_dimension,
      small_lwe_dimension, ks_level, ks_base_log, pbs_level, pbs_base_log,
      grouping_factor, message_modulus, carry_modulus);

  // reinterpret the opaque handles coming from the C API as their real types
  auto cuda_streams = reinterpret_cast<cudaStream_t const *>(streams);
  auto abs_buffer = reinterpret_cast<int_abs_buffer<uint64_t> **>(mem_ptr);

  scratch_cuda_integer_abs_kb<uint64_t>(cuda_streams, gpu_indexes, gpu_count,
                                        abs_buffer, is_signed, num_blocks,
                                        radix_params, allocate_gpu_memory);
}
|
||||
|
||||
// C entry point for the in-place absolute value on a 64-bit radix ciphertext.
// It casts the opaque handles to their concrete types and forwards to
// host_integer_abs_kb<uint64_t>; `mem_ptr` is the buffer produced by the
// matching scratch function.
void cuda_integer_abs_inplace_radix_ciphertext_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    void *ct, int8_t *mem_ptr, bool is_signed, void *const *bsks,
    void *const *ksks, uint32_t num_blocks) {

  auto abs_buffer = reinterpret_cast<int_abs_buffer<uint64_t> *>(mem_ptr);
  auto cuda_streams = reinterpret_cast<cudaStream_t const *>(streams);
  auto keyswitch_keys = reinterpret_cast<uint64_t *const *>(ksks);
  auto ciphertext = static_cast<uint64_t *>(ct);

  host_integer_abs_kb<uint64_t>(cuda_streams, gpu_indexes, gpu_count,
                                ciphertext, bsks, keyswitch_keys, abs_buffer,
                                is_signed, num_blocks);
}
|
||||
|
||||
// C entry point that releases the resources held by the abs scratch buffer
// stored in `*mem_ptr_void`.
//
// The matching scratch function only creates a buffer for signed inputs, so
// a null buffer pointer is accepted and treated as "nothing to release".
void cleanup_cuda_integer_abs_inplace(void *const *streams,
                                      uint32_t const *gpu_indexes,
                                      uint32_t gpu_count,
                                      int8_t **mem_ptr_void) {
  int_abs_buffer<uint64_t> *mem_ptr =
      (int_abs_buffer<uint64_t> *)(*mem_ptr_void);
  if (mem_ptr == nullptr)
    return;
  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
}
|
||||
69
backends/tfhe-cuda-backend/cuda/src/integer/abs.cuh
Normal file
69
backends/tfhe-cuda-backend/cuda/src/integer/abs.cuh
Normal file
@@ -0,0 +1,69 @@
|
||||
#ifndef TFHE_RS_ABS_CUH
|
||||
#define TFHE_RS_ABS_CUH
|
||||
|
||||
#include "crypto/keyswitch.cuh"
|
||||
#include "device.h"
|
||||
#include "integer/bitwise_ops.cuh"
|
||||
#include "integer/comparison.cuh"
|
||||
#include "integer/integer.cuh"
|
||||
#include "integer/integer_utilities.h"
|
||||
#include "integer/negation.cuh"
|
||||
#include "integer/scalar_shifts.cuh"
|
||||
#include "linear_algebra.h"
|
||||
#include "pbs/programmable_bootstrap.h"
|
||||
#include "utils/helper.cuh"
|
||||
#include "utils/kernel_dimensions.cuh"
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
// Creates the scratch buffer used by host_integer_abs_kb and stores it in
// `*mem_ptr`.
//
// A buffer is only needed for signed inputs. For unsigned inputs `*mem_ptr`
// is set to nullptr so that the output pointer always has a defined value
// instead of being left untouched.
template <typename Torus>
__host__ void scratch_cuda_integer_abs_kb(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, int_abs_buffer<Torus> **mem_ptr, bool is_signed,
    uint32_t num_blocks, int_radix_params params, bool allocate_gpu_memory) {

  if (is_signed) {
    *mem_ptr =
        new int_abs_buffer<Torus>(streams, gpu_indexes, gpu_count, params,
                                  num_blocks, allocate_gpu_memory);
  } else {
    *mem_ptr = nullptr;
  }
}
|
||||
|
||||
// In-place absolute value of a signed radix ciphertext `ct` made of
// `num_blocks` blocks. Does nothing when `is_signed` is false.
//
// Steps, as issued below:
//   1. copy ct into mem_ptr->mask and arithmetic-shift the copy right by
//      (number of message bits in the ciphertext - 1);
//   2. add the mask to ct and propagate carries;
//   3. combine ct with the mask using the bitwise operation configured in
//      mem_ptr->bitxor_mem.
template <typename Torus>
__host__ void
host_integer_abs_kb(cudaStream_t const *streams, uint32_t const *gpu_indexes,
                    uint32_t gpu_count, Torus *ct, void *const *bsks,
                    uint64_t *const *ksks, int_abs_buffer<uint64_t> *mem_ptr,
                    bool is_signed, uint32_t num_blocks) {
  if (!is_signed) {
    return;
  }

  auto params = mem_ptr->params;
  auto sign_mask = mem_ptr->mask;

  // size in bytes of one LWE block: (dimension + 1) torus words
  auto lwe_dimension = params.big_lwe_dimension;
  auto block_bytes = (lwe_dimension + 1) * sizeof(Torus);

  // floor(log2(message_modulus)) message bits per block
  uint32_t total_message_bits =
      (31 - __builtin_clz(params.message_modulus)) * num_blocks;

  // step 1: mask = ct >> (total_message_bits - 1), arithmetic shift
  cuda_memcpy_async_gpu_to_gpu(sign_mask, ct, num_blocks * block_bytes,
                               streams[0], gpu_indexes[0]);
  host_integer_radix_arithmetic_scalar_shift_kb_inplace(
      streams, gpu_indexes, gpu_count, sign_mask, total_message_bits - 1,
      mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks, num_blocks);

  // step 2: ct = ct + mask, then clean the carries
  host_addition<Torus>(streams[0], gpu_indexes[0], ct, sign_mask, ct,
                       params.big_lwe_dimension, num_blocks);
  host_propagate_single_carry<Torus>(streams, gpu_indexes, gpu_count, ct,
                                     nullptr, nullptr, mem_ptr->scp_mem, bsks,
                                     ksks, num_blocks);

  // step 3: ct = bitop(ct, mask)
  host_integer_radix_bitop_kb(streams, gpu_indexes, gpu_count, ct, sign_mask,
                              ct, mem_ptr->bitxor_mem, bsks, ksks, num_blocks);
}
|
||||
|
||||
#endif // TFHE_RS_ABS_CUH
|
||||
@@ -127,6 +127,16 @@ __host__ void host_integer_signed_overflowing_add_or_sub_kb(
|
||||
|
||||
// phase 3
|
||||
auto input_carry = &input_carries[(num_blocks - 1) * big_lwe_size];
|
||||
if (op == SIGNED_OPERATION::SUBTRACTION && num_blocks == 1) {
|
||||
// Quick fix for the case where the subtraction is done on a single block
|
||||
Torus *one_scalar =
|
||||
(Torus *)cuda_malloc_async(sizeof(Torus), streams[0], gpu_indexes[0]);
|
||||
cuda_set_value_async<Torus>(streams[0], gpu_indexes[0], one_scalar, 1, 1);
|
||||
create_trivial_radix<Torus>(
|
||||
streams[0], gpu_indexes[0], input_carry, one_scalar, big_lwe_dimension,
|
||||
1, 1, radix_params.message_modulus, radix_params.carry_modulus);
|
||||
cuda_drop_async(one_scalar, streams[0], gpu_indexes[0]);
|
||||
}
|
||||
|
||||
host_resolve_signed_overflow<Torus>(
|
||||
streams, gpu_indexes, gpu_count, overflowed, last_block_inner_propagation,
|
||||
|
||||
@@ -14,27 +14,14 @@ __host__ void zero_out_if(cudaStream_t const *streams,
|
||||
cudaSetDevice(gpu_indexes[0]);
|
||||
auto params = mem_ptr->params;
|
||||
|
||||
int big_lwe_size = params.big_lwe_dimension + 1;
|
||||
|
||||
// Left message is shifted
|
||||
int num_blocks = 0, num_threads = 0;
|
||||
int num_entries = (params.big_lwe_dimension + 1);
|
||||
getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
|
||||
|
||||
// We can't use integer_radix_apply_bivariate_lookup_table_kb since the
|
||||
// second operand is fixed
|
||||
// second operand is not an array
|
||||
auto tmp_lwe_array_input = mem_ptr->tmp;
|
||||
for (int i = 0; i < num_radix_blocks; i++) {
|
||||
auto lwe_array_out_block = tmp_lwe_array_input + i * big_lwe_size;
|
||||
auto lwe_array_input_block = lwe_array_input + i * big_lwe_size;
|
||||
|
||||
device_pack_bivariate_blocks<Torus>
|
||||
<<<num_blocks, num_threads, 0, streams[0]>>>(
|
||||
lwe_array_out_block, predicate->lwe_indexes_in,
|
||||
lwe_array_input_block, lwe_condition, predicate->lwe_indexes_in,
|
||||
params.big_lwe_dimension, params.message_modulus, 1);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
pack_bivariate_blocks_with_single_block<Torus>(
|
||||
streams, gpu_indexes, gpu_count, tmp_lwe_array_input,
|
||||
predicate->lwe_indexes_in, lwe_array_input, lwe_condition,
|
||||
predicate->lwe_indexes_in, params.big_lwe_dimension,
|
||||
params.message_modulus, num_radix_blocks);
|
||||
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, lwe_array_out, tmp_lwe_array_input, bsks,
|
||||
@@ -56,10 +43,7 @@ __host__ void host_integer_radix_cmux_kb(
|
||||
auto true_streams = mem_ptr->zero_if_true_buffer->true_streams;
|
||||
auto false_streams = mem_ptr->zero_if_false_buffer->false_streams;
|
||||
for (uint j = 0; j < gpu_count; j++) {
|
||||
// cuda_synchronize_stream(streams[j], gpu_indexes[j]);
|
||||
cudaEventRecord(mem_ptr->ingoing_events[j], streams[j]);
|
||||
cudaStreamWaitEvent(true_streams[j], mem_ptr->ingoing_events[j], 0);
|
||||
cudaStreamWaitEvent(false_streams[j], mem_ptr->ingoing_events[j], 0);
|
||||
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
|
||||
}
|
||||
|
||||
auto mem_true = mem_ptr->zero_if_true_buffer;
|
||||
@@ -67,29 +51,16 @@ __host__ void host_integer_radix_cmux_kb(
|
||||
lwe_array_true, lwe_condition, mem_true,
|
||||
mem_ptr->inverted_predicate_lut, bsks, ksks,
|
||||
num_radix_blocks);
|
||||
for (uint j = 0; j < gpu_count; j++) {
|
||||
cudaEventRecord(mem_ptr->outgoing_events1[j], true_streams[j]);
|
||||
}
|
||||
|
||||
auto mem_false = mem_ptr->zero_if_false_buffer;
|
||||
zero_out_if<Torus>(false_streams, gpu_indexes, gpu_count,
|
||||
mem_ptr->tmp_false_ct, lwe_array_false, lwe_condition,
|
||||
mem_false, mem_ptr->predicate_lut, bsks, ksks,
|
||||
num_radix_blocks);
|
||||
for (uint j = 0; j < gpu_count; j++) {
|
||||
cudaEventRecord(mem_ptr->outgoing_events2[j], false_streams[j]);
|
||||
for (uint j = 0; j < mem_ptr->zero_if_true_buffer->active_gpu_count; j++) {
|
||||
cuda_synchronize_stream(true_streams[j], gpu_indexes[j]);
|
||||
}
|
||||
// for (uint j = 0; j < mem_ptr->zero_if_true_buffer->active_gpu_count; j++) {
|
||||
// cuda_synchronize_stream(true_streams[j], gpu_indexes[j]);
|
||||
// }
|
||||
// for (uint j = 0; j < mem_ptr->zero_if_false_buffer->active_gpu_count; j++)
|
||||
// {
|
||||
// cuda_synchronize_stream(false_streams[j], gpu_indexes[j]);
|
||||
// }
|
||||
|
||||
for (uint j = 0; j < gpu_count; j++) {
|
||||
cudaStreamWaitEvent(streams[j], mem_ptr->outgoing_events1[j], 0);
|
||||
cudaStreamWaitEvent(streams[j], mem_ptr->outgoing_events2[j], 0);
|
||||
for (uint j = 0; j < mem_ptr->zero_if_false_buffer->active_gpu_count; j++) {
|
||||
cuda_synchronize_stream(false_streams[j], gpu_indexes[j]);
|
||||
}
|
||||
|
||||
// If the condition was true, true_ct will have kept its value and false_ct
|
||||
|
||||
@@ -2,11 +2,12 @@
|
||||
|
||||
void scratch_cuda_integer_div_rem_radix_ciphertext_kb_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
|
||||
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory) {
|
||||
bool is_signed, int8_t **mem_ptr, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t big_lwe_dimension,
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
|
||||
PBS_TYPE pbs_type, bool allocate_gpu_memory) {
|
||||
|
||||
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
|
||||
big_lwe_dimension, small_lwe_dimension, ks_level,
|
||||
@@ -14,7 +15,7 @@ void scratch_cuda_integer_div_rem_radix_ciphertext_kb_64(
|
||||
message_modulus, carry_modulus);
|
||||
|
||||
scratch_cuda_integer_div_rem_kb<uint64_t>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count, is_signed,
|
||||
(int_div_rem_memory<uint64_t> **)mem_ptr, num_blocks, params,
|
||||
allocate_gpu_memory);
|
||||
}
|
||||
@@ -22,7 +23,7 @@ void scratch_cuda_integer_div_rem_radix_ciphertext_kb_64(
|
||||
void cuda_integer_div_rem_radix_ciphertext_kb_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
void *quotient, void *remainder, void const *numerator, void const *divisor,
|
||||
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
|
||||
bool is_signed, int8_t *mem_ptr, void *const *bsks, void *const *ksks,
|
||||
uint32_t num_blocks) {
|
||||
|
||||
auto mem = (int_div_rem_memory<uint64_t> *)mem_ptr;
|
||||
@@ -31,8 +32,8 @@ void cuda_integer_div_rem_radix_ciphertext_kb_64(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
static_cast<uint64_t *>(quotient), static_cast<uint64_t *>(remainder),
|
||||
static_cast<const uint64_t *>(numerator),
|
||||
static_cast<const uint64_t *>(divisor), bsks, (uint64_t **)(ksks), mem,
|
||||
num_blocks);
|
||||
static_cast<const uint64_t *>(divisor), is_signed, bsks,
|
||||
(uint64_t **)(ksks), mem, num_blocks);
|
||||
}
|
||||
|
||||
void cleanup_cuda_integer_div_rem(void *const *streams,
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
|
||||
#include "crypto/keyswitch.cuh"
|
||||
#include "device.h"
|
||||
#include "integer/abs.cuh"
|
||||
#include "integer/comparison.cuh"
|
||||
#include "integer/integer.cuh"
|
||||
#include "integer/integer_utilities.h"
|
||||
@@ -161,22 +162,21 @@ template <typename Torus> struct lwe_ciphertext_list {
|
||||
template <typename Torus>
|
||||
__host__ void scratch_cuda_integer_div_rem_kb(
|
||||
cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count, int_div_rem_memory<Torus> **mem_ptr,
|
||||
uint32_t gpu_count, bool is_signed, int_div_rem_memory<Torus> **mem_ptr,
|
||||
uint32_t num_blocks, int_radix_params params, bool allocate_gpu_memory) {
|
||||
|
||||
*mem_ptr = new int_div_rem_memory<Torus>(
|
||||
streams, gpu_indexes, gpu_count, params, num_blocks, allocate_gpu_memory);
|
||||
*mem_ptr =
|
||||
new int_div_rem_memory<Torus>(streams, gpu_indexes, gpu_count, params,
|
||||
is_signed, num_blocks, allocate_gpu_memory);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void host_integer_div_rem_kb(cudaStream_t const *streams,
|
||||
uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count, Torus *quotient,
|
||||
Torus *remainder, Torus const *numerator,
|
||||
Torus const *divisor, void *const *bsks,
|
||||
uint64_t *const *ksks,
|
||||
int_div_rem_memory<uint64_t> *mem_ptr,
|
||||
uint32_t num_blocks) {
|
||||
__host__ void host_unsigned_integer_div_rem_kb(
|
||||
cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count, Torus *quotient, Torus *remainder,
|
||||
Torus const *numerator, Torus const *divisor, void *const *bsks,
|
||||
uint64_t *const *ksks, unsigned_int_div_rem_memory<uint64_t> *mem_ptr,
|
||||
uint32_t num_blocks) {
|
||||
|
||||
auto radix_params = mem_ptr->params;
|
||||
|
||||
@@ -375,16 +375,7 @@ __host__ void host_integer_div_rem_kb(cudaStream_t const *streams,
|
||||
}; // left_shift_interesting_remainder2
|
||||
|
||||
for (uint j = 0; j < gpu_count; j++) {
|
||||
cudaEventRecord(mem_ptr->ingoing_events1[j], streams[j]);
|
||||
// cuda_synchronize_stream(streams[j], gpu_indexes[j]);
|
||||
cudaStreamWaitEvent(mem_ptr->sub_streams_1[j],
|
||||
mem_ptr->ingoing_events1[j], 0);
|
||||
cudaStreamWaitEvent(mem_ptr->sub_streams_2[j],
|
||||
mem_ptr->ingoing_events1[j], 0);
|
||||
cudaStreamWaitEvent(mem_ptr->sub_streams_3[j],
|
||||
mem_ptr->ingoing_events1[j], 0);
|
||||
cudaStreamWaitEvent(mem_ptr->sub_streams_4[j],
|
||||
mem_ptr->ingoing_events1[j], 0);
|
||||
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
|
||||
}
|
||||
// interesting_divisor
|
||||
trim_last_interesting_divisor_bits(mem_ptr->sub_streams_1, gpu_indexes,
|
||||
@@ -398,21 +389,11 @@ __host__ void host_integer_div_rem_kb(cudaStream_t const *streams,
|
||||
// interesting_remainder2
|
||||
left_shift_interesting_remainder2(mem_ptr->sub_streams_4, gpu_indexes,
|
||||
gpu_count);
|
||||
|
||||
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
|
||||
cudaEventRecord(mem_ptr->outgoing_events1[j], mem_ptr->sub_streams_1[j]);
|
||||
cudaEventRecord(mem_ptr->outgoing_events2[j], mem_ptr->sub_streams_2[j]);
|
||||
cudaEventRecord(mem_ptr->outgoing_events3[j], mem_ptr->sub_streams_3[j]);
|
||||
cudaEventRecord(mem_ptr->outgoing_events4[j], mem_ptr->sub_streams_4[j]);
|
||||
|
||||
// cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
|
||||
// cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
|
||||
// cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]);
|
||||
// cuda_synchronize_stream(mem_ptr->sub_streams_4[j], gpu_indexes[j]);
|
||||
cudaStreamWaitEvent(streams[j], mem_ptr->outgoing_events1[j], 0);
|
||||
cudaStreamWaitEvent(streams[j], mem_ptr->outgoing_events2[j], 0);
|
||||
cudaStreamWaitEvent(streams[j], mem_ptr->outgoing_events3[j], 0);
|
||||
cudaStreamWaitEvent(streams[j], mem_ptr->outgoing_events4[j], 0);
|
||||
cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
|
||||
cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
|
||||
cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]);
|
||||
cuda_synchronize_stream(mem_ptr->sub_streams_4[j], gpu_indexes[j]);
|
||||
}
|
||||
|
||||
// if interesting_remainder1 != 0 -> interesting_remainder2 == 0
|
||||
@@ -444,16 +425,11 @@ __host__ void host_integer_div_rem_kb(cudaStream_t const *streams,
|
||||
auto do_overflowing_sub = [&](cudaStream_t const *streams,
|
||||
uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count) {
|
||||
uint32_t compute_borrow = 1;
|
||||
uint32_t uses_input_borrow = 0;
|
||||
mem_ptr->overflow_sub_mem->update_lut_indexes(
|
||||
streams, gpu_indexes, merged_interesting_remainder.len);
|
||||
host_integer_overflowing_sub<uint64_t>(
|
||||
host_integer_overflowing_sub_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, new_remainder.data,
|
||||
(uint64_t *)merged_interesting_remainder.data,
|
||||
interesting_divisor.data, subtraction_overflowed.data,
|
||||
(const Torus *)nullptr, mem_ptr->overflow_sub_mem, bsks, ksks,
|
||||
merged_interesting_remainder.len, compute_borrow, uses_input_borrow);
|
||||
subtraction_overflowed.data, merged_interesting_remainder.data,
|
||||
interesting_divisor.data, bsks, ksks, mem_ptr->overflow_sub_mem,
|
||||
merged_interesting_remainder.len);
|
||||
};
|
||||
|
||||
// fills:
|
||||
@@ -502,14 +478,7 @@ __host__ void host_integer_div_rem_kb(cudaStream_t const *streams,
|
||||
|
||||
// phase 2
|
||||
for (uint j = 0; j < gpu_count; j++) {
|
||||
cudaEventRecord(mem_ptr->ingoing_events2[j], streams[j]);
|
||||
cudaStreamWaitEvent(mem_ptr->sub_streams_1[j],
|
||||
mem_ptr->ingoing_events2[j], 0);
|
||||
cudaStreamWaitEvent(mem_ptr->sub_streams_2[j],
|
||||
mem_ptr->ingoing_events2[j], 0);
|
||||
cudaStreamWaitEvent(mem_ptr->sub_streams_3[j],
|
||||
mem_ptr->ingoing_events2[j], 0);
|
||||
// cuda_synchronize_stream(streams[j], gpu_indexes[j]);
|
||||
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
|
||||
}
|
||||
// new_remainder
|
||||
// subtraction_overflowed
|
||||
@@ -520,15 +489,9 @@ __host__ void host_integer_div_rem_kb(cudaStream_t const *streams,
|
||||
create_clean_version_of_merged_remainder(mem_ptr->sub_streams_3,
|
||||
gpu_indexes, gpu_count);
|
||||
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
|
||||
cudaEventRecord(mem_ptr->outgoing_events5[j], mem_ptr->sub_streams_1[j]);
|
||||
cudaEventRecord(mem_ptr->outgoing_events6[j], mem_ptr->sub_streams_2[j]);
|
||||
cudaEventRecord(mem_ptr->outgoing_events7[j], mem_ptr->sub_streams_3[j]);
|
||||
cudaStreamWaitEvent(streams[j], mem_ptr->outgoing_events5[j], 0);
|
||||
cudaStreamWaitEvent(streams[j], mem_ptr->outgoing_events6[j], 0);
|
||||
cudaStreamWaitEvent(streams[j], mem_ptr->outgoing_events7[j], 0);
|
||||
// cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
|
||||
// cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
|
||||
// cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]);
|
||||
cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
|
||||
cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
|
||||
cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]);
|
||||
}
|
||||
|
||||
host_addition<Torus>(streams[0], gpu_indexes[0], overflow_sum.data,
|
||||
@@ -583,14 +546,7 @@ __host__ void host_integer_div_rem_kb(cudaStream_t const *streams,
|
||||
};
|
||||
|
||||
for (uint j = 0; j < gpu_count; j++) {
|
||||
cudaEventRecord(mem_ptr->ingoing_events3[j], streams[j]);
|
||||
cudaStreamWaitEvent(mem_ptr->sub_streams_1[j],
|
||||
mem_ptr->ingoing_events3[j], 0);
|
||||
cudaStreamWaitEvent(mem_ptr->sub_streams_2[j],
|
||||
mem_ptr->ingoing_events3[j], 0);
|
||||
cudaStreamWaitEvent(mem_ptr->sub_streams_3[j],
|
||||
mem_ptr->ingoing_events3[j], 0);
|
||||
// cuda_synchronize_stream(streams[j], gpu_indexes[j]);
|
||||
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
|
||||
}
|
||||
// cleaned_merged_interesting_remainder
|
||||
conditionally_zero_out_merged_interesting_remainder(mem_ptr->sub_streams_1,
|
||||
@@ -601,15 +557,9 @@ __host__ void host_integer_div_rem_kb(cudaStream_t const *streams,
|
||||
// quotient
|
||||
set_quotient_bit(mem_ptr->sub_streams_3, gpu_indexes, gpu_count);
|
||||
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
|
||||
cudaEventRecord(mem_ptr->outgoing_events8[j], mem_ptr->sub_streams_1[j]);
|
||||
cudaEventRecord(mem_ptr->outgoing_events9[j], mem_ptr->sub_streams_2[j]);
|
||||
cudaEventRecord(mem_ptr->outgoing_events10[j], mem_ptr->sub_streams_3[j]);
|
||||
cudaStreamWaitEvent(streams[j], mem_ptr->outgoing_events8[j], 0);
|
||||
cudaStreamWaitEvent(streams[j], mem_ptr->outgoing_events9[j], 0);
|
||||
cudaStreamWaitEvent(streams[j], mem_ptr->outgoing_events10[j], 0);
|
||||
// cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
|
||||
// cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
|
||||
// cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]);
|
||||
cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
|
||||
cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
|
||||
cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]);
|
||||
}
|
||||
|
||||
assert(first_trivial_block - 1 == cleaned_merged_interesting_remainder.len);
|
||||
@@ -644,4 +594,105 @@ __host__ void host_integer_div_rem_kb(cudaStream_t const *streams,
|
||||
}
|
||||
}
|
||||
|
||||
// Radix integer division with remainder, dispatching on `is_signed`.
//
// Unsigned case: forwards straight to host_unsigned_integer_div_rem_kb on the
// caller's `streams`.
//
// Signed case (all work is issued on int_mem_ptr->sub_streams_1 / _2):
//   1. numerator and divisor are cloned into scratch lists and
//      host_integer_abs_kb is applied to each copy;
//   2. the unsigned division runs on the two scratch copies while
//      `compare_signed_bits_lut` is applied to the last block of the original
//      numerator and divisor, producing `sign_bits_are_different`;
//   3. negated versions of quotient and remainder are built (negation followed
//      by a single carry propagation);
//   4. two cmux calls write the final quotient (condition:
//      `sign_bits_are_different`) and the final remainder (condition: last
//      block of the original numerator).
// The host calls cuda_synchronize_stream between phases and before returning.
//
// `quotient` / `remainder` are outputs; `numerator` / `divisor` are inputs.
// `int_mem_ptr` supplies every scratch buffer, LUT and sub-stream used here.
template <typename Torus>
__host__ void host_integer_div_rem_kb(cudaStream_t const *streams,
                                      uint32_t const *gpu_indexes,
                                      uint32_t gpu_count, Torus *quotient,
                                      Torus *remainder, Torus const *numerator,
                                      Torus const *divisor, bool is_signed,
                                      void *const *bsks, uint64_t *const *ksks,
                                      int_div_rem_memory<uint64_t> *int_mem_ptr,
                                      uint32_t num_blocks) {

  // Unsigned operands need no pre/post-processing
  if (!is_signed) {
    host_unsigned_integer_div_rem_kb<Torus>(
        streams, gpu_indexes, gpu_count, quotient, remainder, numerator,
        divisor, bsks, ksks, int_mem_ptr->unsigned_mem, num_blocks);
    return;
  }

  auto radix_params = int_mem_ptr->params;
  uint32_t big_lwe_size = radix_params.big_lwe_dimension + 1;

  // Last radix block of each original operand
  Torus const *numerator_last_block =
      &numerator[big_lwe_size * (num_blocks - 1)];
  Torus const *divisor_last_block = &divisor[big_lwe_size * (num_blocks - 1)];

  // Host-side wait on both sub-streams of every active GPU
  auto wait_for_sub_streams = [&]() {
    for (uint j = 0; j < int_mem_ptr->active_gpu_count; j++) {
      cuda_synchronize_stream(int_mem_ptr->sub_streams_1[j], gpu_indexes[j]);
      cuda_synchronize_stream(int_mem_ptr->sub_streams_2[j], gpu_indexes[j]);
    }
  };

  // Scratch copies of the operands, later overwritten by host_integer_abs_kb
  lwe_ciphertext_list<Torus> positive_numerator(int_mem_ptr->positive_numerator,
                                                radix_params, num_blocks);
  lwe_ciphertext_list<Torus> positive_divisor(int_mem_ptr->positive_divisor,
                                              radix_params, num_blocks);

  positive_numerator.clone_from((Torus *)numerator, 0, num_blocks - 1,
                                streams[0], gpu_indexes[0]);
  positive_divisor.clone_from((Torus *)divisor, 0, num_blocks - 1, streams[0],
                              gpu_indexes[0]);

  // The copies were issued on the caller's streams: wait for them before the
  // sub-streams start working on the scratch buffers
  for (uint j = 0; j < gpu_count; j++) {
    cuda_synchronize_stream(streams[j], gpu_indexes[j]);
  }

  // Phase 1: host_integer_abs_kb on each scratch copy, one sub-stream set each
  host_integer_abs_kb<Torus>(int_mem_ptr->sub_streams_1, gpu_indexes, gpu_count,
                             positive_numerator.data, bsks, ksks,
                             int_mem_ptr->abs_mem_1, true, num_blocks);
  host_integer_abs_kb<Torus>(int_mem_ptr->sub_streams_2, gpu_indexes, gpu_count,
                             positive_divisor.data, bsks, ksks,
                             int_mem_ptr->abs_mem_2, true, num_blocks);
  wait_for_sub_streams();

  // Phase 2: unsigned division of the scratch copies, and LUT over the last
  // blocks of the original operands
  host_unsigned_integer_div_rem_kb<Torus>(
      int_mem_ptr->sub_streams_1, gpu_indexes, gpu_count, quotient, remainder,
      positive_numerator.data, positive_divisor.data, bsks, ksks,
      int_mem_ptr->unsigned_mem, num_blocks);

  integer_radix_apply_bivariate_lookup_table_kb<Torus>(
      int_mem_ptr->sub_streams_2, gpu_indexes, gpu_count,
      int_mem_ptr->sign_bits_are_different, numerator_last_block,
      divisor_last_block, bsks, ksks, 1, int_mem_ptr->compare_signed_bits_lut,
      int_mem_ptr->compare_signed_bits_lut->params.message_modulus);
  wait_for_sub_streams();

  // Phase 3: negated quotient (sub_streams_1) and negated remainder
  // (sub_streams_2), each followed by a carry propagation
  host_integer_radix_negation(
      int_mem_ptr->sub_streams_1, gpu_indexes, gpu_count,
      int_mem_ptr->negated_quotient, quotient, radix_params.big_lwe_dimension,
      num_blocks, radix_params.message_modulus, radix_params.carry_modulus);

  host_propagate_single_carry<Torus>(
      int_mem_ptr->sub_streams_1, gpu_indexes, gpu_count,
      int_mem_ptr->negated_quotient, nullptr, nullptr, int_mem_ptr->scp_mem_1,
      bsks, ksks, num_blocks);

  host_integer_radix_negation(
      int_mem_ptr->sub_streams_2, gpu_indexes, gpu_count,
      int_mem_ptr->negated_remainder, remainder, radix_params.big_lwe_dimension,
      num_blocks, radix_params.message_modulus, radix_params.carry_modulus);

  host_propagate_single_carry<Torus>(
      int_mem_ptr->sub_streams_2, gpu_indexes, gpu_count,
      int_mem_ptr->negated_remainder, nullptr, nullptr, int_mem_ptr->scp_mem_2,
      bsks, ksks, num_blocks);

  // Phase 4: select between the negated and the plain results
  host_integer_radix_cmux_kb<Torus>(
      int_mem_ptr->sub_streams_1, gpu_indexes, gpu_count, quotient,
      int_mem_ptr->sign_bits_are_different, int_mem_ptr->negated_quotient,
      quotient, int_mem_ptr->cmux_quotient_mem, bsks, ksks, num_blocks);

  host_integer_radix_cmux_kb<Torus>(
      int_mem_ptr->sub_streams_2, gpu_indexes, gpu_count, remainder,
      numerator_last_block, int_mem_ptr->negated_remainder, remainder,
      int_mem_ptr->cmux_remainder_mem, bsks, ksks, num_blocks);

  wait_for_sub_streams();
}
|
||||
|
||||
#endif // TFHE_RS_DIV_REM_CUH
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
#include "integer/integer.cuh"
|
||||
#include "integer/negation.cuh"
|
||||
#include <linear_algebra.h>
|
||||
|
||||
void cuda_full_propagation_64_inplace(void *const *streams,
|
||||
@@ -63,46 +62,6 @@ void scratch_cuda_propagate_single_carry_kb_64_inplace(
|
||||
allocate_gpu_memory);
|
||||
}
|
||||
|
||||
// 64-bit C entry point that sets up the scratch memory for the "fast" single
// carry propagation.
//
// The scalar parameters are bundled into an int_radix_params, the untyped
// handles are cast to their concrete types and everything is forwarded to
// scratch_cuda_fast_propagate_single_carry_kb_inplace<uint64_t>.
// `mem_ptr` is handed to the backend as int_fast_sc_prop_memory<uint64_t> **.
void scratch_cuda_fast_propagate_single_carry_kb_64_inplace(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
    uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t requested_flag,
    uint32_t uses_carry, bool allocate_gpu_memory) {

  int_radix_params radix_params(pbs_type, glwe_dimension, polynomial_size,
                                big_lwe_dimension, small_lwe_dimension,
                                ks_level, ks_base_log, pbs_level, pbs_base_log,
                                grouping_factor, message_modulus,
                                carry_modulus);

  // Typed views of the opaque handles received through the C interface
  auto typed_streams = (cudaStream_t *)(streams);
  auto typed_mem = (int_fast_sc_prop_memory<uint64_t> **)mem_ptr;

  scratch_cuda_fast_propagate_single_carry_kb_inplace<uint64_t>(
      typed_streams, gpu_indexes, gpu_count, typed_mem, num_blocks,
      radix_params, requested_flag, uses_carry, allocate_gpu_memory);
}
|
||||
|
||||
// 64-bit C entry point that sets up the scratch memory for the overflowing
// subtraction.
//
// Builds an int_radix_params from the scalar arguments and forwards to
// scratch_cuda_integer_overflowing_sub<uint64_t>, passing `mem_ptr` as
// int_fast_borrow_prop_memory<uint64_t> **.
void scratch_cuda_integer_overflowing_sub_kb_64_inplace(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
    uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t compute_overflow,
    bool allocate_gpu_memory) {

  int_radix_params radix_params(pbs_type, glwe_dimension, polynomial_size,
                                big_lwe_dimension, small_lwe_dimension,
                                ks_level, ks_base_log, pbs_level, pbs_base_log,
                                grouping_factor, message_modulus,
                                carry_modulus);

  // Typed views of the opaque handles received through the C interface
  auto typed_streams = (cudaStream_t *)(streams);
  auto typed_mem = (int_fast_borrow_prop_memory<uint64_t> **)mem_ptr;

  scratch_cuda_integer_overflowing_sub<uint64_t>(
      typed_streams, gpu_indexes, gpu_count, typed_mem, num_blocks,
      radix_params, compute_overflow, allocate_gpu_memory);
}
|
||||
|
||||
void cuda_propagate_single_carry_kb_64_inplace(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
void *lwe_array, void *carry_out, int8_t *mem_ptr, void *const *bsks,
|
||||
@@ -114,37 +73,6 @@ void cuda_propagate_single_carry_kb_64_inplace(
|
||||
(uint64_t **)(ksks), num_blocks);
|
||||
}
|
||||
|
||||
// 64-bit C entry point for the "fast" single carry propagation.
//
// Casts the untyped buffers to uint64_t pointers and `mem_ptr` to
// int_fast_sc_prop_memory<uint64_t> *, then forwards every argument to
// host_fast_propagate_single_carry<uint64_t>.
void cuda_fast_propagate_single_carry_kb_64_inplace(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    void *lwe_array, void *carry_out, const void *carry_in, int8_t *mem_ptr,
    void *const *bsks, void *const *ksks, uint32_t num_blocks,
    uint32_t requested_flag, uint32_t uses_carry) {

  // Typed views of the opaque handles received through the C interface
  auto typed_streams = (cudaStream_t *)(streams);
  auto typed_mem = (int_fast_sc_prop_memory<uint64_t> *)mem_ptr;
  auto typed_ksks = (uint64_t **)(ksks);

  auto blocks = static_cast<uint64_t *>(lwe_array);
  auto carry_out_block = static_cast<uint64_t *>(carry_out);
  auto carry_in_block = static_cast<const uint64_t *>(carry_in);

  host_fast_propagate_single_carry<uint64_t>(
      typed_streams, gpu_indexes, gpu_count, blocks, carry_out_block,
      carry_in_block, typed_mem, bsks, typed_ksks, num_blocks, requested_flag,
      uses_carry);
}
|
||||
|
||||
// 64-bit C entry point for the in-place overflowing subtraction.
//
// `lhs_array` is passed to host_integer_overflowing_sub<uint64_t> twice, as
// both the second and the third pointer argument, which is what makes this
// variant in-place. The remaining buffers are cast to uint64_t pointers and
// `mem_ptr` to int_fast_borrow_prop_memory<uint64_t> *.
void cuda_integer_overflowing_sub_kb_64_inplace(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    void *lhs_array, const void *rhs_array, void *overflow_block,
    const void *input_borrow, int8_t *mem_ptr, void *const *bsks,
    void *const *ksks, uint32_t num_blocks, uint32_t compute_overflow,
    uint32_t uses_input_borrow) {

  // Typed views of the opaque handles received through the C interface
  auto typed_streams = (cudaStream_t const *)streams;
  auto typed_mem = (int_fast_borrow_prop_memory<uint64_t> *)mem_ptr;
  auto typed_ksks = (uint64_t **)ksks;

  auto rhs = static_cast<const uint64_t *>(rhs_array);
  auto overflow = static_cast<uint64_t *>(overflow_block);
  auto borrow_in = static_cast<const uint64_t *>(input_borrow);

  host_integer_overflowing_sub<uint64_t>(
      typed_streams, gpu_indexes, gpu_count,
      static_cast<uint64_t *>(lhs_array), static_cast<uint64_t *>(lhs_array),
      rhs, overflow, borrow_in, typed_mem, bsks, typed_ksks, num_blocks,
      compute_overflow, uses_input_borrow);
}
|
||||
|
||||
void cuda_propagate_single_carry_get_input_carries_kb_64_inplace(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
void *lwe_array, void *carry_out, void *input_carries, int8_t *mem_ptr,
|
||||
@@ -166,24 +94,6 @@ void cleanup_cuda_propagate_single_carry(void *const *streams,
|
||||
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
|
||||
}
|
||||
|
||||
// Releases the resources held by the fast carry-propagation scratch memory.
//
// `*mem_ptr_void` is reinterpreted as int_fast_sc_prop_memory<uint64_t> * and
// its release() method is called with the given streams and GPU indexes.
// NOTE(review): only release() is invoked; the object itself is not deleted
// and `*mem_ptr_void` is left unchanged. Confirm who owns the struct.
void cleanup_cuda_fast_propagate_single_carry(void *const *streams,
                                              uint32_t const *gpu_indexes,
                                              uint32_t gpu_count,
                                              int8_t **mem_ptr_void) {
  auto scratch =
      reinterpret_cast<int_fast_sc_prop_memory<uint64_t> *>(*mem_ptr_void);
  auto typed_streams = (cudaStream_t *)(streams);
  scratch->release(typed_streams, gpu_indexes, gpu_count);
}
|
||||
|
||||
// Releases the resources held by the overflowing-subtraction scratch memory.
//
// `*mem_ptr_void` is reinterpreted as int_fast_borrow_prop_memory<uint64_t> *
// and its release() method is called with the given streams and GPU indexes.
// NOTE(review): only release() is invoked; the object itself is not deleted
// and `*mem_ptr_void` is left unchanged. Confirm who owns the struct.
void cleanup_cuda_integer_overflowing_sub(void *const *streams,
                                          uint32_t const *gpu_indexes,
                                          uint32_t gpu_count,
                                          int8_t **mem_ptr_void) {
  auto scratch =
      reinterpret_cast<int_fast_borrow_prop_memory<uint64_t> *>(*mem_ptr_void);
  auto typed_streams = (cudaStream_t *)(streams);
  scratch->release(typed_streams, gpu_indexes, gpu_count);
}
|
||||
|
||||
void scratch_cuda_apply_univariate_lut_kb_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
int8_t **mem_ptr, void const *input_lut, uint32_t lwe_dimension,
|
||||
|
||||
@@ -8,7 +8,6 @@
|
||||
#include "integer/scalar_addition.cuh"
|
||||
#include "linear_algebra.h"
|
||||
#include "linearalgebra/addition.cuh"
|
||||
#include "linearalgebra/negation.cuh"
|
||||
#include "pbs/programmable_bootstrap.h"
|
||||
#include "polynomial/functions.cuh"
|
||||
#include "utils/helper.cuh"
|
||||
@@ -129,134 +128,6 @@ host_radix_blocks_reverse_inplace(cudaStream_t const *streams,
|
||||
<<<num_blocks, num_threads, 0, streams[0]>>>(src, blocks_count, lwe_size);
|
||||
}
|
||||
|
||||
// Running (inclusive) sum of radix blocks inside each group.
//
// Each CUDA block handles the group starting at radix block
// blockIdx.x * group_size. For group_size = 4 the first group becomes:
//   dest[0] = src[0]
//   dest[1] = src[0] + src[1]
//   dest[2] = src[0] + src[1] + src[2]
//   dest[3] = src[0] + src[1] + src[2] + src[3]
// where every index designates a block of `lwe_size` coefficients. Threads
// stride over the coefficients, each one accumulating its own coefficient
// across the blocks of the group. Blocks at or past `blocks_count` are skipped,
// so the last group may be partial.
template <typename Torus>
__global__ void
radix_cumulative_sum_in_groups(Torus *dest, Torus *src, uint32_t blocks_count,
                               uint32_t lwe_size, uint32_t group_size) {

  // First radix block of this group, and its coefficient offset
  const uint32_t group_first_block = blockIdx.x * group_size;
  const size_t group_base = group_first_block * lwe_size;

  for (int coeff = threadIdx.x; coeff < lwe_size; coeff += blockDim.x) {
    const size_t first_pos = coeff + group_base;

    // The first block of the group is copied unchanged
    Torus running_sum = src[first_pos];
    dest[first_pos] = running_sum;

    for (int member = 1; member < group_size; member++) {
      // in case the last group is not full
      if (!(member + group_first_block < blocks_count))
        continue;

      const size_t member_pos = first_pos + member * lwe_size;
      running_sum += src[member_pos];
      dest[member_pos] = running_sum;
    }
  }
}
|
||||
|
||||
// Launches radix_cumulative_sum_in_groups on `stream` of device `gpu_index`.
//
// dest / src: arrays of `radix_blocks_count` blocks of `lwe_size` elements.
// group_size: number of consecutive radix blocks summed together.
//
// The grid has one CUDA block per group (ceil(radix_blocks_count /
// group_size)) with 512 threads each. The launch is checked with
// check_cuda_error, like the other launchers of this file. With no radix
// block there is nothing to do and no kernel is launched, since a zero-sized
// grid is an invalid launch configuration.
template <typename Torus>
__host__ void host_radix_cumulative_sum_in_groups(
    cudaStream_t stream, uint32_t gpu_index, Torus *dest, Torus *src,
    uint32_t radix_blocks_count, uint32_t lwe_size, uint32_t group_size) {
  cudaSetDevice(gpu_index);
  if (radix_blocks_count == 0)
    return;

  // Each CUDA block is responsible for a single group
  int num_blocks = (radix_blocks_count + group_size - 1) / group_size,
      num_threads = 512;
  radix_cumulative_sum_in_groups<Torus><<<num_blocks, num_threads, 0, stream>>>(
      dest, src, radix_blocks_count, lwe_size, group_size);
  check_cuda_error(cudaGetLastError());
}
|
||||
|
||||
// Splits `src` into two arrays, `simulators` and `grouping_pgns`.
// One CUDA block per radix block (blockIdx.x = radix block index); threads
// stride over the `lwe_size` coefficients of that block.
//
// Radix block b that is NOT the first of its group:
//   simulators[b] = src[b - 1]
// Radix block b that IS the first of its group (b % group_size == 0):
//   - b == 0: simulators[0] is filled with zeros;
//   - b != 0: simulators[b] gets zeros in its first lwe_size - 1 coefficients
//     and `delta` in the last one;
//   - if the group of b is not the last group:
//     grouping_pgns[b / group_size] = src[b + group_size - 1]
//     (the last block of that group).
template <typename Torus>
__global__ void radix_split_simulators_and_grouping_pgns(
    Torus *simulators, Torus *grouping_pgns, Torus *src, uint32_t blocks_count,
    uint32_t lwe_size, uint32_t group_size, Torus delta) {

  const size_t block_offset = blockIdx.x * lwe_size;
  const bool starts_group = (blockIdx.x % group_size) == 0;

  if (!starts_group) {
    // save simulators: copy of the previous radix block of src
    const size_t previous_offset = (blockIdx.x - 1) * lwe_size;
    for (int j = threadIdx.x; j < lwe_size; j += blockDim.x) {
      simulators[j + block_offset] = src[j + previous_offset];
    }
    return;
  }

  if (blockIdx.x == 0) {
    // save trivial 0
    for (int j = threadIdx.x; j < lwe_size; j += blockDim.x) {
      simulators[j] = 0;
    }
  } else {
    // save trivial 1: zero everything but the last coefficient...
    for (int j = threadIdx.x; j < lwe_size - 1; j += blockDim.x) {
      simulators[j + block_offset] = 0;
    }
    // ...which a single thread sets to delta
    if (threadIdx.x == 0) {
      simulators[lwe_size - 1 + block_offset] = delta;
    }
  }

  // Every group but the last one also exports its last block
  const uint32_t group_id = blockIdx.x / group_size;
  const uint32_t group_total = (blocks_count + group_size - 1) / group_size;
  if ((group_id + 1) < group_total) {
    const size_t last_in_group_offset =
        (blockIdx.x + group_size - 1) * lwe_size;
    const size_t pgns_offset = group_id * lwe_size;
    for (int j = threadIdx.x; j < lwe_size; j += blockDim.x) {
      grouping_pgns[j + pgns_offset] = src[j + last_in_group_offset];
    }
  }
}
|
||||
|
||||
// Launches radix_split_simulators_and_grouping_pgns on `stream` of device
// `gpu_index`.
//
// simulators / grouping_pgns: output arrays written by the kernel.
// src: input array of `radix_blocks_count` blocks of `lwe_size` elements.
// group_size, delta: forwarded unchanged to the kernel.
//
// The grid has one CUDA block per radix block, with 512 threads each. The
// launch is checked with check_cuda_error, like the other launchers of this
// file. With no radix block there is nothing to do and no kernel is launched,
// since a zero-sized grid is an invalid launch configuration.
template <typename Torus>
__host__ void host_radix_split_simulators_and_grouping_pgns(
    cudaStream_t stream, uint32_t gpu_index, Torus *simulators,
    Torus *grouping_pgns, Torus *src, uint32_t radix_blocks_count,
    uint32_t lwe_size, uint32_t group_size, Torus delta) {
  cudaSetDevice(gpu_index);
  if (radix_blocks_count == 0)
    return;

  // Each CUDA block is responsible for a single radix block
  int num_blocks = radix_blocks_count, num_threads = 512;
  radix_split_simulators_and_grouping_pgns<Torus>
      <<<num_blocks, num_threads, 0, stream>>>(simulators, grouping_pgns, src,
                                               radix_blocks_count, lwe_size,
                                               group_size, delta);
  check_cuda_error(cudaGetLastError());
}
|
||||
|
||||
// Adds to every radix block of `src1` the block of `src2` associated with its
// group.
//
// src1 holds one block per radix block, src2 one block per group (a block is
// `lwe_size` coefficients). One CUDA block per radix block; threads stride
// over the coefficients. For group_size = 4 the first group becomes:
//   dest[0] = src1[0] + src2[0]
//   dest[1] = src1[1] + src2[0]
//   dest[2] = src1[2] + src2[0]
//   dest[3] = src1[3] + src2[0]
// `blocks_count` is not read by the kernel.
template <typename Torus>
__global__ void radix_sum_in_groups(Torus *dest, Torus *src1, Torus *src2,
                                    uint32_t blocks_count, uint32_t lwe_size,
                                    uint32_t group_size) {

  // Offset of this radix block in src1/dest, and of its group block in src2
  const size_t radix_block_base = blockIdx.x * lwe_size;
  const size_t group_block_base = (blockIdx.x / group_size) * lwe_size;

  int coeff = threadIdx.x;
  while (coeff < lwe_size) {
    const size_t pos = coeff + radix_block_base;
    dest[pos] = src1[pos] + src2[coeff + group_block_base];
    coeff += blockDim.x;
  }
}
|
||||
|
||||
// Launches radix_sum_in_groups on `stream` of device `gpu_index`.
//
// dest / src1: arrays of `radix_blocks_count` blocks of `lwe_size` elements.
// src2: array indexed by group (radix block index / group_size).
//
// The grid has one CUDA block per radix block, with 512 threads each. The
// launch is checked with check_cuda_error, like the other launchers of this
// file. With no radix block there is nothing to do and no kernel is launched,
// since a zero-sized grid is an invalid launch configuration.
template <typename Torus>
__host__ void host_radix_sum_in_groups(cudaStream_t stream, uint32_t gpu_index,
                                       Torus *dest, Torus *src1, Torus *src2,
                                       uint32_t radix_blocks_count,
                                       uint32_t lwe_size, uint32_t group_size) {
  cudaSetDevice(gpu_index);
  if (radix_blocks_count == 0)
    return;

  int num_blocks = radix_blocks_count, num_threads = 512;
  radix_sum_in_groups<Torus><<<num_blocks, num_threads, 0, stream>>>(
      dest, src1, src2, radix_blocks_count, lwe_size, group_size);
  check_cuda_error(cudaGetLastError());
}
|
||||
|
||||
// polynomial_size threads
|
||||
template <typename Torus>
|
||||
__global__ void
|
||||
@@ -271,8 +142,10 @@ device_pack_bivariate_blocks(Torus *lwe_array_out, Torus const *lwe_indexes_out,
|
||||
int block_id = tid / (lwe_dimension + 1);
|
||||
int coeff_id = tid % (lwe_dimension + 1);
|
||||
|
||||
int pos_in = lwe_indexes_in[block_id] * (lwe_dimension + 1) + coeff_id;
|
||||
int pos_out = lwe_indexes_out[block_id] * (lwe_dimension + 1) + coeff_id;
|
||||
const int pos_in =
|
||||
lwe_indexes_in[block_id] * (lwe_dimension + 1) + coeff_id;
|
||||
const int pos_out =
|
||||
lwe_indexes_out[block_id] * (lwe_dimension + 1) + coeff_id;
|
||||
lwe_array_out[pos_out] = lwe_array_1[pos_in] * shift + lwe_array_2[pos_in];
|
||||
}
|
||||
}
|
||||
@@ -301,6 +174,50 @@ pack_bivariate_blocks(cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
// Packs every block of `lwe_array_1` with the single block `lwe_2`:
//   out[lwe_indexes_out[b]] = lwe_array_1[lwe_indexes_in[b]] * shift + lwe_2
// coefficient by coefficient, a block being lwe_dimension + 1 coefficients.
//
// One thread per (block, coefficient) pair: threads whose global index is at
// or beyond num_blocks * (lwe_dimension + 1) do nothing.
template <typename Torus>
__global__ void device_pack_bivariate_blocks_with_single_block(
    Torus *lwe_array_out, Torus const *lwe_indexes_out,
    Torus const *lwe_array_1, Torus const *lwe_2, Torus const *lwe_indexes_in,
    uint32_t lwe_dimension, uint32_t shift, uint32_t num_blocks) {
  int tid = threadIdx.x + blockIdx.x * blockDim.x;

  // Surplus threads of the last CUDA block have nothing to do
  if (!(tid < num_blocks * (lwe_dimension + 1)))
    return;

  // Which radix block / which coefficient this thread is responsible for
  int block_id = tid / (lwe_dimension + 1);
  int coeff_id = tid % (lwe_dimension + 1);

  // Input and output blocks are addressed through their index arrays
  const int pos_in = lwe_indexes_in[block_id] * (lwe_dimension + 1) + coeff_id;
  const int pos_out =
      lwe_indexes_out[block_id] * (lwe_dimension + 1) + coeff_id;

  // lwe_2 is a single block: only the coefficient index applies to it
  lwe_array_out[pos_out] = lwe_array_1[pos_in] * shift + lwe_2[coeff_id];
}
|
||||
|
||||
/* Packs every block m1 of lwe_array_1 together with the single ciphertext
 * lwe_2 as out = m1 * shift + lwe_2
 *
 * Variant of the bivariate packing for the case where the second operand is
 * one block instead of an array. The kernel is launched on streams[0] of
 * gpu_indexes[0] with one thread per coefficient to produce.
 */
template <typename Torus>
__host__ void pack_bivariate_blocks_with_single_block(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_indexes_out,
    Torus const *lwe_array_1, Torus const *lwe_2, Torus const *lwe_indexes_in,
    uint32_t lwe_dimension, uint32_t shift, uint32_t num_radix_blocks) {

  cudaSetDevice(gpu_indexes[0]);

  // Total number of coefficients written by the kernel
  int num_entries = num_radix_blocks * (lwe_dimension + 1);

  // Launch geometry, filled in by getNumBlocksAndThreads
  int grid_size = 0;
  int block_size = 0;
  getNumBlocksAndThreads(num_entries, 512, grid_size, block_size);

  // The left message (lwe_array_1) is the one multiplied by shift
  device_pack_bivariate_blocks_with_single_block<Torus>
      <<<grid_size, block_size, 0, streams[0]>>>(
          lwe_array_out, lwe_indexes_out, lwe_array_1, lwe_2, lwe_indexes_in,
          lwe_dimension, shift, num_radix_blocks);
  check_cuda_error(cudaGetLastError());
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void integer_radix_apply_univariate_lookup_table_kb(
|
||||
cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
@@ -608,48 +525,6 @@ void generate_lookup_table(Torus *acc, uint32_t glwe_dimension,
|
||||
rotate_left<Torus>(body, half_box_size, polynomial_size);
|
||||
}
|
||||
|
||||
/*
 * Fills the host accumulator `acc` with several look-up tables packed one
 * after the other in the body polynomial.
 *
 * acc              - host buffer of (glwe_dimension + 1) * polynomial_size
 *                    elements; the first glwe_dimension polynomials (mask)
 *                    are zeroed, the last one (body) receives the tables
 * glwe_dimension, polynomial_size - accumulator geometry
 * message_modulus, carry_modulus  - their product is the number of boxes of
 *                    the body
 * functions        - functions to encode, at least one and at most
 *                    (message_modulus * carry_modulus) / 2 of them
 *
 * Function f owns (modulus_sup / fn_counts) consecutive boxes starting at
 * f * single_function_sub_lut_size; box i of that range holds
 * functions[f](i) * delta in each of its box_size coefficients.
 */
template <typename Torus>
void generate_many_lookup_table(
    Torus *acc, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t message_modulus, uint32_t carry_modulus,
    std::vector<std::function<Torus(Torus)>> &functions) {

  uint32_t modulus_sup = message_modulus * carry_modulus;
  uint32_t box_size = polynomial_size / modulus_sup;
  // 1ull rather than 1ul: `unsigned long` may be only 32 bits wide, in which
  // case shifting it by 63 is undefined.
  Torus delta = (1ull << 63) / modulus_sup;

  // Zero the mask AND the body: when modulus_sup is not a multiple of the
  // number of functions, the trailing boxes of the body are not written by
  // the loops below and must not keep whatever the buffer contained.
  memset(acc, 0,
         (size_t)(glwe_dimension + 1) * polynomial_size * sizeof(Torus));

  auto body = &acc[glwe_dimension * polynomial_size];

  size_t fn_counts = functions.size();

  // fn_counts is used as a divisor below
  assert(fn_counts > 0);
  assert(fn_counts <= modulus_sup / 2);

  // Number of boxes and space used for each sub lut
  uint32_t boxes_per_function = modulus_sup / fn_counts;
  uint32_t single_function_sub_lut_size = boxes_per_function * box_size;

  // Write the sub lut of every function
  for (size_t f = 0; f < fn_counts; f++) {
    size_t lut_offset = f * single_function_sub_lut_size;
    for (uint32_t i = 0; i < boxes_per_function; i++) {
      // The value is the same for the whole box: evaluate the function once
      Torus f_eval = functions[f](i);
      size_t box_begin = (size_t)i * box_size + lut_offset;
      for (size_t j = box_begin; j < box_begin + box_size; j++) {
        body[j] = f_eval * delta;
      }
    }
  }
  int half_box_size = box_size / 2;

  // Negate the first half_box_size coefficients
  for (int i = 0; i < half_box_size; i++) {
    body[i] = -body[i];
  }

  rotate_left<Torus>(body, half_box_size, polynomial_size);
}
|
||||
|
||||
template <typename Torus>
|
||||
void generate_lookup_table_bivariate(Torus *acc, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size,
|
||||
@@ -783,37 +658,6 @@ void generate_device_accumulator(cudaStream_t stream, uint32_t gpu_index,
|
||||
free(h_lut);
|
||||
}
|
||||
|
||||
/*
 *  generate many lut accumulator for device pointer
 *    stream          - cuda stream used for the host to device copy
 *    gpu_index       - device owning `acc`
 *    acc             - device pointer for accumulator, must hold
 *                      (glwe_dimension + 1) * polynomial_size elements
 *    glwe_dimension, polynomial_size, message_modulus, carry_modulus
 *                    - forwarded to generate_many_lookup_table
 *    functions       - evaluating functions with one Torus input
 *
 *  The stream is synchronized before returning.
 */
template <typename Torus>
void generate_many_lut_device_accumulator(
    cudaStream_t stream, uint32_t gpu_index, Torus *acc,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t message_modulus,
    uint32_t carry_modulus,
    std::vector<std::function<Torus(Torus)>> &functions) {

  const size_t lut_len = (size_t)(glwe_dimension + 1) * polynomial_size;

  // host lut: a vector instead of a raw malloc, so an allocation failure is
  // reported (std::bad_alloc) instead of handing a null pointer to the
  // generator, and the buffer is released on every exit path
  std::vector<Torus> h_lut(lut_len);

  // fill accumulator
  generate_many_lookup_table<Torus>(h_lut.data(), glwe_dimension,
                                    polynomial_size, message_modulus,
                                    carry_modulus, functions);

  // copy host lut to device
  cuda_memcpy_async_to_gpu(acc, h_lut.data(), lut_len * sizeof(Torus), stream,
                           gpu_index);

  // The copy is asynchronous: wait for it before h_lut goes out of scope
  cuda_synchronize_stream(stream, gpu_index);
}
|
||||
|
||||
template <typename Torus>
|
||||
void scratch_cuda_propagate_single_carry_kb_inplace(
|
||||
cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
@@ -826,108 +670,6 @@ void scratch_cuda_propagate_single_carry_kb_inplace(
|
||||
num_radix_blocks, allocate_gpu_memory);
|
||||
}
|
||||
|
||||
/*
 * Applies the "first step" many-LUT of `mem` to the num_blocks blocks of
 * lwe_array, writing into mem->shifted_blocks_and_states, then splits that
 * buffer on streams[0]:
 *   - its first  num_blocks ciphertexts are copied to mem->block_states
 *   - its second num_blocks ciphertexts are copied to mem->shifted_blocks
 * Each ciphertext has glwe_dimension * polynomial_size + 1 elements.
 */
template <typename Torus>
void host_compute_shifted_blocks_and_states(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, Torus *lwe_array, int_radix_params params,
    int_shifted_blocks_and_states_memory<Torus> *mem, void *const *bsks,
    Torus *const *ksks, uint32_t num_blocks, uint32_t lut_stride,
    uint32_t lut_count) {

  const uint32_t big_lwe_size =
      params.glwe_dimension * params.polynomial_size + 1;
  // Size in bytes of one half of the many-LUT output
  const auto half_output_bytes = big_lwe_size * sizeof(Torus) * num_blocks;

  auto many_lut_output = mem->shifted_blocks_and_states;
  auto first_step_luts = mem->luts_array_first_step;

  integer_radix_apply_many_univariate_lookup_table_kb<Torus>(
      streams, gpu_indexes, gpu_count, many_lut_output, lwe_array, bsks, ksks,
      num_blocks, first_step_luts, lut_count, lut_stride);

  auto shifted_blocks_dst = mem->shifted_blocks;
  auto block_states_dst = mem->block_states;

  // First half of the output: block states
  cuda_memcpy_async_gpu_to_gpu(block_states_dst, many_lut_output,
                               half_output_bytes, streams[0], gpu_indexes[0]);
  // Second half of the output: shifted blocks
  auto second_half = many_lut_output + big_lwe_size * num_blocks;
  cuda_memcpy_async_gpu_to_gpu(shifted_blocks_dst, second_half,
                               half_output_bytes, streams[0], gpu_indexes[0]);
}
|
||||
|
||||
/*
 * Resolves the carries of the groups one batch after the other.
 *
 * Slot 1 of resolved_carries receives a plain copy of the first entry of
 * grouping_pgns. The remaining num_groups - 2 carries are then produced in
 * batches of at most mem->grouping_size - 1: each batch puts the last
 * resolved carry followed by the next entries of grouping_pgns in
 * mem->group_resolved_carries, runs a cumulative sum in groups over it,
 * applies mem->lut_sequential_algorithm to everything but the first entry
 * and copies the result after the last resolved slot.
 * Nothing is done when num_groups <= 1.
 */
template <typename Torus>
void host_resolve_group_carries_sequentially(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, Torus *resolved_carries, Torus *grouping_pgns,
    int_radix_params params, int_seq_group_prop_memory<Torus> *mem,
    void *const *bsks, Torus *const *ksks, uint32_t num_groups) {

  const uint32_t big_lwe_size =
      params.glwe_dimension * params.polynomial_size + 1;
  const auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus);

  // Intermediate array: [last resolved carry | blocks of the current batch]
  auto scratch = mem->group_resolved_carries;

  if (num_groups <= 1)
    return;

  // First carry is just copied
  cuda_memcpy_async_gpu_to_gpu(resolved_carries + big_lwe_size, grouping_pgns,
                               big_lwe_size_bytes, streams[0], gpu_indexes[0]);

  const uint32_t solve_per_iter = mem->grouping_size - 1;
  // The first one has been resolved and we ignore the last one
  const uint32_t remaining_carries = num_groups - 2;
  const uint32_t num_loops =
      ceil(double(remaining_carries) / (double)(solve_per_iter));
  uint32_t last_resolved_pos = 1;

  for (uint32_t iter = 0; iter < num_loops; iter++) {
    const uint32_t loop_offset = iter * solve_per_iter;
    // The last iteration may have fewer blocks left than a full batch
    const uint32_t blocks_to_solve =
        (loop_offset + solve_per_iter > remaining_carries)
            ? remaining_carries - loop_offset
            : solve_per_iter;
    const auto batch_bytes = blocks_to_solve * big_lwe_size_bytes;
    auto scratch_batch = scratch + big_lwe_size;

    // Head of the intermediate array: the last resolved carry
    cuda_memcpy_async_gpu_to_gpu(
        scratch, resolved_carries + last_resolved_pos * big_lwe_size,
        big_lwe_size_bytes, streams[0], gpu_indexes[0]);

    // Rest of the intermediate array: the blocks of this batch
    cuda_memcpy_async_gpu_to_gpu(
        scratch_batch, grouping_pgns + last_resolved_pos * big_lwe_size,
        batch_bytes, streams[0], gpu_indexes[0]);

    // One in-place cumulative sum over head + batch
    host_radix_cumulative_sum_in_groups<Torus>(
        streams[0], gpu_indexes[0], scratch, scratch, blocks_to_solve + 1,
        big_lwe_size, mem->grouping_size);

    // Apply the lut in place on the batch (the head is left out)
    auto luts_sequential = mem->lut_sequential_algorithm;
    integer_radix_apply_univariate_lookup_table_kb<Torus>(
        streams, gpu_indexes, gpu_count, scratch_batch, scratch_batch, bsks,
        ksks, blocks_to_solve, luts_sequential);

    // Store the batch right after the last resolved carry
    cuda_memcpy_async_gpu_to_gpu(
        resolved_carries + (last_resolved_pos + 1) * big_lwe_size,
        scratch_batch, batch_bytes, streams[0], gpu_indexes[0]);

    last_resolved_pos += blocks_to_solve;
  }
}
|
||||
|
||||
template <typename Torus>
|
||||
void host_compute_prefix_sum_hillis_steele(
|
||||
cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
@@ -965,95 +707,6 @@ void host_compute_prefix_sum_hillis_steele(
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Chains, in this order:
 *   1. a cumulative sum in groups of block_states into
 *      mem->propagation_cum_sums
 *   2. mem->luts_array_second_step applied in place on that array
 *   3. an in-place scalar addition of mem->scalar_array_cum_sum
 *   4. the split of the result into mem->simulators and mem->grouping_pgns
 *   5. the resolution of the group carries into mem->resolved_carries, with
 *      the sequential algorithm or with Hillis-Steele depending on
 *      mem->use_sequential_algorithm_to_resolver_group_carries
 */
template <typename Torus>
void host_compute_propagation_simulators_and_group_carries(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, Torus *block_states, int_radix_params params,
    int_prop_simu_group_carries_memory<Torus> *mem, void *const *bsks,
    Torus *const *ksks, uint32_t num_blocks, uint32_t num_groups) {

  auto message_modulus = params.message_modulus;
  auto carry_modulus = params.carry_modulus;
  const uint32_t big_lwe_size =
      params.glwe_dimension * params.polynomial_size + 1;

  auto cum_sums = mem->propagation_cum_sums;
  auto group_size = mem->group_size;

  // 1. cumulative sum of the block states inside each group
  host_radix_cumulative_sum_in_groups<Torus>(streams[0], gpu_indexes[0],
                                             cum_sums, block_states, num_blocks,
                                             big_lwe_size, group_size);

  // 2. second step luts, in place
  auto second_step_luts = mem->luts_array_second_step;
  integer_radix_apply_univariate_lookup_table_kb<Torus>(
      streams, gpu_indexes, gpu_count, cum_sums, cum_sums, bsks, ksks,
      num_blocks, second_step_luts);

  // 3. scalar addition, in place
  auto scalars = mem->scalar_array_cum_sum;
  auto big_lwe_dimension = big_lwe_size - 1;
  host_integer_radix_scalar_addition_inplace<Torus>(
      streams, gpu_indexes, gpu_count, cum_sums, scalars, big_lwe_dimension,
      num_blocks, message_modulus, carry_modulus);

  // 4. split into simulators and grouping pgns
  uint32_t modulus_sup = message_modulus * carry_modulus;
  Torus delta = (1ull << 63) / modulus_sup;
  auto simulators = mem->simulators;
  auto grouping_pgns = mem->grouping_pgns;
  host_radix_split_simulators_and_grouping_pgns<Torus>(
      streams[0], gpu_indexes[0], simulators, grouping_pgns, cum_sums,
      num_blocks, big_lwe_size, group_size, delta);

  // 5. resolve the group carries
  auto resolved_carries = mem->resolved_carries;
  if (!mem->use_sequential_algorithm_to_resolver_group_carries) {
    // Hillis-Steele prefix sum, written from the second slot of
    // resolved_carries, over num_groups - 1 entries
    auto hillis_steele_lut = mem->hs_group_prop_mem->lut_hillis_steele;
    host_compute_prefix_sum_hillis_steele<Torus>(
        streams, gpu_indexes, gpu_count, &resolved_carries[big_lwe_size],
        grouping_pgns, params, hillis_steele_lut, bsks, ksks, num_groups - 1);
  } else {
    // Sequential resolution
    host_resolve_group_carries_sequentially(
        streams, gpu_indexes, gpu_count, resolved_carries, grouping_pgns,
        params, mem->seq_group_prop_mem, bsks, ksks, num_groups);
  }
}
|
||||
|
||||
/*
 * Borrow counterpart of the shifted blocks / states computation: applies the
 * "first step" many-LUT of `mem` to the num_blocks blocks of lwe_array,
 * writing into mem->shifted_blocks_and_borrow_states, then splits that buffer
 * on streams[0]:
 *   - its first  num_blocks ciphertexts are copied to mem->borrow_states
 *   - its second num_blocks ciphertexts are copied to mem->shifted_blocks
 * Each ciphertext has glwe_dimension * polynomial_size + 1 elements.
 */
template <typename Torus>
void host_compute_shifted_blocks_and_borrow_states(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, Torus *lwe_array, int_radix_params params,
    int_shifted_blocks_and_borrow_states_memory<Torus> *mem, void *const *bsks,
    Torus *const *ksks, uint32_t num_blocks, uint32_t lut_stride,
    uint32_t lut_count) {

  const uint32_t big_lwe_size =
      params.glwe_dimension * params.polynomial_size + 1;
  // Size in bytes of one half of the many-LUT output
  const auto half_output_bytes = big_lwe_size * sizeof(Torus) * num_blocks;

  auto many_lut_output = mem->shifted_blocks_and_borrow_states;
  auto first_step_luts = mem->luts_array_first_step;

  integer_radix_apply_many_univariate_lookup_table_kb<Torus>(
      streams, gpu_indexes, gpu_count, many_lut_output, lwe_array, bsks, ksks,
      num_blocks, first_step_luts, lut_count, lut_stride);

  auto shifted_blocks_dst = mem->shifted_blocks;
  auto borrow_states_dst = mem->borrow_states;

  // First half of the output: borrow states
  cuda_memcpy_async_gpu_to_gpu(borrow_states_dst, many_lut_output,
                               half_output_bytes, streams[0], gpu_indexes[0]);
  // Second half of the output: shifted blocks
  auto second_half = many_lut_output + big_lwe_size * num_blocks;
  cuda_memcpy_async_gpu_to_gpu(shifted_blocks_dst, second_half,
                               half_output_bytes, streams[0], gpu_indexes[0]);
}
|
||||
|
||||
template <typename Torus>
|
||||
void host_propagate_single_carry(cudaStream_t const *streams,
|
||||
uint32_t const *gpu_indexes,
|
||||
@@ -1539,247 +1192,4 @@ void host_apply_bivariate_lut_kb(
|
||||
radix_lwe_in_2, bsks, ksks, num_blocks, mem, shift);
|
||||
}
|
||||
|
||||
/*
 * Builds the int_fast_sc_prop_memory object used by the fast single carry
 * propagation and stores its address in *mem_ptr. Every argument is handed to
 * the constructor as is. The object is allocated with `new`; this function
 * does not release it.
 */
template <typename Torus>
void scratch_cuda_fast_propagate_single_carry_kb_inplace(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, int_fast_sc_prop_memory<Torus> **mem_ptr,
    uint32_t num_radix_blocks, int_radix_params params, uint32_t requested_flag,
    uint32_t uses_carry, bool allocate_gpu_memory) {

  auto *scratch = new int_fast_sc_prop_memory<Torus>(
      streams, gpu_indexes, gpu_count, params, num_radix_blocks, requested_flag,
      uses_carry, allocate_gpu_memory);
  *mem_ptr = scratch;
}
|
||||
|
||||
/*
 * Fast single carry propagation, in place on lwe_array.
 *
 * lwe_array      - num_blocks blocks, overwritten with the message-extracted
 *                  result
 * carry_out      - receives one block holding the requested flag; only
 *                  written when requested_flag is 1 or 2
 * input_carries  - block added to the first block of lwe_array when
 *                  uses_carry == 1
 * mem            - scratch memory, also provides the parameters and the luts
 * requested_flag - 0: no flag, 1: overflow flag, 2: carry flag
 * uses_carry     - 1 to take input_carries into account
 *
 * The first part runs on `streams`; after a host-side synchronization of
 * these streams the message extraction runs on mem->sub_streams_1 and the
 * flag computation on mem->sub_streams_2. Both sets of sub-streams are
 * synchronized before returning.
 */
template <typename Torus>
void host_fast_propagate_single_carry(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, Torus *lwe_array, Torus *carry_out,
    const Torus *input_carries, int_fast_sc_prop_memory<Torus> *mem,
    void *const *bsks, Torus *const *ksks, uint32_t num_blocks,
    uint32_t requested_flag, uint32_t uses_carry) {
  auto params = mem->params;
  auto glwe_dimension = params.glwe_dimension;
  auto polynomial_size = params.polynomial_size;
  uint32_t big_lwe_size = glwe_dimension * polynomial_size + 1;
  auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
  auto big_lwe_dimension = big_lwe_size - 1; // For host addition
  auto lut_stride = mem->lut_stride;
  auto lut_count = mem->lut_count;

  // Prefixed enumerators: a bare OVERFLOW can clash with a macro of that
  // name that some <math.h> implementations define.
  enum outputFlag { FLAG_NONE = 0, FLAG_OVERFLOW = 1, FLAG_CARRY = 2 };
  const bool overflow_requested = requested_flag == FLAG_OVERFLOW;
  const bool carry_requested = requested_flag == FLAG_CARRY;
  const bool flag_requested = overflow_requested || carry_requested;

  if (uses_carry == 1) {
    host_addition<Torus>(streams[0], gpu_indexes[0], lwe_array, lwe_array,
                         input_carries, big_lwe_dimension, 1);
  }

  host_compute_shifted_blocks_and_states<Torus>(
      streams, gpu_indexes, gpu_count, lwe_array, params,
      mem->shifted_blocks_state_mem, bsks, ksks, num_blocks, lut_stride,
      lut_count);
  auto block_states = mem->shifted_blocks_state_mem->block_states;

  // Seed of the output flag, taken from the last block
  if (overflow_requested) {
    // This operation could be added to the many lut with some trickery to be
    // in parallel, for now it is a separate lut application
    auto lut_overflow_prep = mem->lut_overflow_flag_prep;
    integer_radix_apply_univariate_lookup_table_kb<Torus>(
        streams, gpu_indexes, gpu_count, mem->output_flag,
        lwe_array + (num_blocks - 1) * big_lwe_size, bsks, ksks, 1,
        lut_overflow_prep);
  } else if (carry_requested) {
    cuda_memcpy_async_gpu_to_gpu(
        mem->output_flag, block_states + (num_blocks - 1) * big_lwe_size,
        big_lwe_size_bytes, streams[0], gpu_indexes[0]);
  }

  host_compute_propagation_simulators_and_group_carries<Torus>(
      streams, gpu_indexes, gpu_count, block_states, params,
      mem->prop_simu_group_carries_mem, bsks, ksks, num_blocks,
      mem->num_groups);

  auto group_size = mem->prop_simu_group_carries_mem->group_size;

  // prepared_blocks = shifted_blocks + simulators
  auto prepared_blocks = mem->prop_simu_group_carries_mem->prepared_blocks;
  auto shifted_blocks = mem->shifted_blocks_state_mem->shifted_blocks;
  host_addition<Torus>(streams[0], gpu_indexes[0], prepared_blocks,
                       shifted_blocks,
                       mem->prop_simu_group_carries_mem->simulators,
                       big_lwe_dimension, num_blocks);

  // The flag also accumulates the simulator of the last block
  if (flag_requested) {
    host_addition<Torus>(streams[0], gpu_indexes[0], mem->output_flag,
                         mem->output_flag,
                         mem->prop_simu_group_carries_mem->simulators +
                             (num_blocks - 1) * big_lwe_size,
                         big_lwe_dimension, 1);
  }

  // Everything queued on the main streams must be finished before the
  // sub-streams start reading its results
  for (uint32_t j = 0; j < mem->active_gpu_count; j++) {
    cuda_synchronize_stream(streams[j], gpu_indexes[j]);
  }

  // Sub-streams 1: add the resolved carries, then extract the message
  host_radix_sum_in_groups<Torus>(
      mem->sub_streams_1[0], gpu_indexes[0], prepared_blocks, prepared_blocks,
      mem->prop_simu_group_carries_mem->resolved_carries, num_blocks,
      big_lwe_size, group_size);

  auto message_extract = mem->lut_message_extract;
  integer_radix_apply_univariate_lookup_table_kb<Torus>(
      mem->sub_streams_1, gpu_indexes, gpu_count, lwe_array, prepared_blocks,
      bsks, ksks, num_blocks, message_extract);

  // Sub-streams 2: finish the flag
  if (flag_requested) {
    // Add the resolved carry of the last group
    host_addition<Torus>(mem->sub_streams_2[0], gpu_indexes[0],
                         mem->output_flag, mem->output_flag,
                         mem->prop_simu_group_carries_mem->resolved_carries +
                             (mem->num_groups - 1) * big_lwe_size,
                         big_lwe_dimension, 1);

    // Final lut, depending on which flag was requested
    auto lut_flag_last = overflow_requested ? mem->lut_overflow_flag_last
                                            : mem->lut_carry_flag_last;
    integer_radix_apply_univariate_lookup_table_kb<Torus>(
        mem->sub_streams_2, gpu_indexes, gpu_count, mem->output_flag,
        mem->output_flag, bsks, ksks, 1, lut_flag_last);

    // NOTE(review): the same copy is queued once per active GPU, as in the
    // original code - confirm that a single copy is not what is intended
    for (uint32_t j = 0; j < mem->active_gpu_count; j++) {
      cuda_memcpy_async_gpu_to_gpu(carry_out, mem->output_flag,
                                   big_lwe_size_bytes, mem->sub_streams_2[j],
                                   gpu_indexes[j]);
    }
  }

  for (uint32_t j = 0; j < mem->active_gpu_count; j++) {
    cuda_synchronize_stream(mem->sub_streams_1[j], gpu_indexes[j]);
    cuda_synchronize_stream(mem->sub_streams_2[j], gpu_indexes[j]);
  }
}
|
||||
|
||||
/*
 * Builds the int_fast_borrow_prop_memory object used by the overflowing
 * subtraction and stores its address in *mem_ptr. Every argument is handed to
 * the constructor as is. The object is allocated with `new`; this function
 * does not release it.
 */
template <typename Torus>
void scratch_cuda_integer_overflowing_sub(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, int_fast_borrow_prop_memory<Torus> **mem_ptr,
    uint32_t num_radix_blocks, int_radix_params params,
    uint32_t compute_overflow, bool allocate_gpu_memory) {

  auto *scratch = new int_fast_borrow_prop_memory<Torus>(
      streams, gpu_indexes, gpu_count, params, num_radix_blocks,
      compute_overflow, allocate_gpu_memory);
  *mem_ptr = scratch;
}
|
||||
|
||||
/*
 * Fast borrow propagation, in place on lhsrhs_array.
 *
 * lhsrhs_array      - num_blocks blocks, overwritten with the
 *                     message-extracted result
 * overflow_block    - receives the borrow flag when compute_overflow == 1
 * input_borrow      - block subtracted from the first block of lhsrhs_array
 *                     when uses_input_borrow == 1
 * mem               - scratch memory, also provides the parameters, the luts,
 *                     the sub-streams and the events
 * num_groups        - must not exceed mem->num_groups
 * compute_overflow  - 1 to compute the overflow block
 * uses_input_borrow - 1 to take input_borrow into account
 *
 * The first part is queued on `streams`. The flag lut then runs on
 * mem->sub_streams_1 and the borrow subtraction + message extraction on
 * mem->sub_streams_2; both wait on an event recorded on streams[0], and
 * streams[0] is made to wait on both of them before the function returns.
 * No host-side synchronization is done here.
 */
template <typename Torus>
void host_fast_borrow_propagate(cudaStream_t const *streams,
                                uint32_t const *gpu_indexes, uint32_t gpu_count,
                                Torus *lhsrhs_array, Torus *overflow_block,
                                const Torus *input_borrow,
                                int_fast_borrow_prop_memory<Torus> *mem,
                                void *const *bsks, Torus *const *ksks,
                                uint32_t num_blocks, uint32_t num_groups,
                                uint32_t compute_overflow,
                                uint32_t uses_input_borrow) {
  auto params = mem->params;
  auto glwe_dimension = params.glwe_dimension;
  auto polynomial_size = params.polynomial_size;
  auto message_modulus = params.message_modulus;
  auto carry_modulus = params.carry_modulus;
  uint32_t big_lwe_size = glwe_dimension * polynomial_size + 1;
  auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
  auto big_lwe_dimension = big_lwe_size - 1;
  auto lut_stride = mem->lut_stride;
  auto lut_count = mem->lut_count;

  assert(mem->num_groups >= num_groups);
  if (uses_input_borrow == 1) {
    host_unchecked_sub_with_correcting_term<Torus>(
        streams[0], gpu_indexes[0], lhsrhs_array, lhsrhs_array, input_borrow,
        big_lwe_dimension, 1, message_modulus, carry_modulus,
        message_modulus - 1);
  }

  host_compute_shifted_blocks_and_borrow_states<Torus>(
      streams, gpu_indexes, gpu_count, lhsrhs_array, params,
      mem->shifted_blocks_borrow_state_mem, bsks, ksks, num_blocks, lut_stride,
      lut_count);

  // Seed of the overflow block: borrow state of the last block
  auto borrow_states = mem->shifted_blocks_borrow_state_mem->borrow_states;
  cuda_memcpy_async_gpu_to_gpu(mem->overflow_block,
                               borrow_states + (num_blocks - 1) * big_lwe_size,
                               big_lwe_size_bytes, streams[0], gpu_indexes[0]);

  host_compute_propagation_simulators_and_group_carries<Torus>(
      streams, gpu_indexes, gpu_count, borrow_states, params,
      mem->prop_simu_group_carries_mem, bsks, ksks, num_blocks, num_groups);

  auto shifted_blocks = mem->shifted_blocks_borrow_state_mem->shifted_blocks;
  auto prepared_blocks = mem->prop_simu_group_carries_mem->prepared_blocks;
  auto simulators = mem->prop_simu_group_carries_mem->simulators;
  auto resolved_borrows = mem->prop_simu_group_carries_mem->resolved_carries;

  // prepared_blocks = shifted_blocks - simulators
  host_subtraction<Torus>(streams[0], gpu_indexes[0], prepared_blocks,
                          shifted_blocks, simulators, big_lwe_dimension,
                          num_blocks);

  // unchecked_scalar_add_assign
  host_integer_radix_add_scalar_one_inplace<Torus>(
      streams, gpu_indexes, gpu_count, prepared_blocks, big_lwe_dimension,
      num_blocks, message_modulus, carry_modulus);

  if (compute_overflow == 1) {
    // unchecked_add_assign in overflow_block: simulator of the last block
    host_addition<Torus>(streams[0], gpu_indexes[0], mem->overflow_block,
                         mem->overflow_block,
                         simulators + (num_blocks - 1) * big_lwe_size,
                         big_lwe_dimension, 1);

    // ...then the resolved borrow of the last group. This needs to be done
    // before the next step because that step modifies the resolved borrows.
    host_addition<Torus>(streams[0], gpu_indexes[0], mem->overflow_block,
                         mem->overflow_block,
                         resolved_borrows + (num_groups - 1) * big_lwe_size,
                         big_lwe_dimension, 1);
  }

  // Fork: both sub-streams start once streams[0] reached this point.
  // Event and stream calls report errors through their return value, which
  // is checked like the other CUDA runtime calls of this file.
  check_cuda_error(cudaEventRecord(mem->incoming_events[0], streams[0]));
  check_cuda_error(
      cudaStreamWaitEvent(mem->sub_streams_1[0], mem->incoming_events[0], 0));
  check_cuda_error(
      cudaStreamWaitEvent(mem->sub_streams_2[0], mem->incoming_events[0], 0));

  // Sub-streams 1: borrow flag
  if (compute_overflow == 1) {
    auto borrow_flag = mem->lut_borrow_flag;
    integer_radix_apply_univariate_lookup_table_kb<Torus>(
        mem->sub_streams_1, gpu_indexes, gpu_count, overflow_block,
        mem->overflow_block, bsks, ksks, 1, borrow_flag);
  }
  check_cuda_error(
      cudaEventRecord(mem->outgoing_events1[0], mem->sub_streams_1[0]));

  // Sub-streams 2: subtract borrow and cleanup prepared blocks
  host_negation<Torus>(mem->sub_streams_2[0], gpu_indexes[0], resolved_borrows,
                       resolved_borrows, big_lwe_dimension, num_groups);

  host_radix_sum_in_groups<Torus>(
      mem->sub_streams_2[0], gpu_indexes[0], prepared_blocks, prepared_blocks,
      resolved_borrows, num_blocks, big_lwe_size, mem->group_size);

  auto message_extract = mem->lut_message_extract;
  integer_radix_apply_univariate_lookup_table_kb<Torus>(
      mem->sub_streams_2, gpu_indexes, gpu_count, lhsrhs_array, prepared_blocks,
      bsks, ksks, num_blocks, message_extract);

  check_cuda_error(
      cudaEventRecord(mem->outgoing_events2[0], mem->sub_streams_2[0]));

  // Join: streams[0] resumes after both sub-streams are done
  check_cuda_error(
      cudaStreamWaitEvent(streams[0], mem->outgoing_events1[0], 0));
  check_cuda_error(
      cudaStreamWaitEvent(streams[0], mem->outgoing_events2[0], 0));
}
|
||||
|
||||
#endif // TFHE_RS_INTERNAL_INTEGER_CUH
|
||||
|
||||
@@ -67,11 +67,12 @@ void generate_ids_update_degrees(int *terms_degree, size_t *h_lwe_idx_in,
|
||||
*/
|
||||
void scratch_cuda_integer_mult_radix_ciphertext_kb_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
int8_t **mem_ptr, uint32_t message_modulus, uint32_t carry_modulus,
|
||||
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t pbs_base_log, uint32_t pbs_level, uint32_t ks_base_log,
|
||||
uint32_t ks_level, uint32_t grouping_factor, uint32_t num_radix_blocks,
|
||||
PBS_TYPE pbs_type, bool allocate_gpu_memory) {
|
||||
int8_t **mem_ptr, bool const is_boolean_left, bool const is_boolean_right,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, uint32_t glwe_dimension,
|
||||
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t pbs_base_log,
|
||||
uint32_t pbs_level, uint32_t ks_base_log, uint32_t ks_level,
|
||||
uint32_t grouping_factor, uint32_t num_radix_blocks, PBS_TYPE pbs_type,
|
||||
bool allocate_gpu_memory) {
|
||||
|
||||
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
|
||||
polynomial_size * glwe_dimension, lwe_dimension,
|
||||
@@ -88,8 +89,8 @@ void scratch_cuda_integer_mult_radix_ciphertext_kb_64(
|
||||
case 16384:
|
||||
scratch_cuda_integer_mult_radix_ciphertext_kb<uint64_t>(
|
||||
(cudaStream_t const *)(streams), gpu_indexes, gpu_count,
|
||||
(int_mul_memory<uint64_t> **)mem_ptr, num_radix_blocks, params,
|
||||
allocate_gpu_memory);
|
||||
(int_mul_memory<uint64_t> **)mem_ptr, is_boolean_left, is_boolean_right,
|
||||
num_radix_blocks, params, allocate_gpu_memory);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (integer multiplication): unsupported polynomial size. "
|
||||
@@ -126,65 +127,66 @@ void scratch_cuda_integer_mult_radix_ciphertext_kb_64(
|
||||
*/
|
||||
void cuda_integer_mult_radix_ciphertext_kb_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
void *radix_lwe_out, void const *radix_lwe_left,
|
||||
void const *radix_lwe_right, void *const *bsks, void *const *ksks,
|
||||
int8_t *mem_ptr, uint32_t polynomial_size, uint32_t num_blocks) {
|
||||
void *radix_lwe_out, void const *radix_lwe_left, bool const is_bool_left,
|
||||
void const *radix_lwe_right, bool const is_bool_right, void *const *bsks,
|
||||
void *const *ksks, int8_t *mem_ptr, uint32_t polynomial_size,
|
||||
uint32_t num_blocks) {
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<256>>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
static_cast<uint64_t *>(radix_lwe_out),
|
||||
static_cast<const uint64_t *>(radix_lwe_left),
|
||||
static_cast<const uint64_t *>(radix_lwe_right), bsks,
|
||||
static_cast<const uint64_t *>(radix_lwe_left), is_bool_left,
|
||||
static_cast<const uint64_t *>(radix_lwe_right), is_bool_right, bsks,
|
||||
(uint64_t **)(ksks), (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
|
||||
break;
|
||||
case 512:
|
||||
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<512>>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
static_cast<uint64_t *>(radix_lwe_out),
|
||||
static_cast<const uint64_t *>(radix_lwe_left),
|
||||
static_cast<const uint64_t *>(radix_lwe_right), bsks,
|
||||
static_cast<const uint64_t *>(radix_lwe_left), is_bool_left,
|
||||
static_cast<const uint64_t *>(radix_lwe_right), is_bool_right, bsks,
|
||||
(uint64_t **)(ksks), (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
|
||||
break;
|
||||
case 1024:
|
||||
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<1024>>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
static_cast<uint64_t *>(radix_lwe_out),
|
||||
static_cast<const uint64_t *>(radix_lwe_left),
|
||||
static_cast<const uint64_t *>(radix_lwe_right), bsks,
|
||||
static_cast<const uint64_t *>(radix_lwe_left), is_bool_left,
|
||||
static_cast<const uint64_t *>(radix_lwe_right), is_bool_right, bsks,
|
||||
(uint64_t **)(ksks), (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
|
||||
break;
|
||||
case 2048:
|
||||
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<2048>>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
static_cast<uint64_t *>(radix_lwe_out),
|
||||
static_cast<const uint64_t *>(radix_lwe_left),
|
||||
static_cast<const uint64_t *>(radix_lwe_right), bsks,
|
||||
static_cast<const uint64_t *>(radix_lwe_left), is_bool_left,
|
||||
static_cast<const uint64_t *>(radix_lwe_right), is_bool_right, bsks,
|
||||
(uint64_t **)(ksks), (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
|
||||
break;
|
||||
case 4096:
|
||||
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<4096>>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
static_cast<uint64_t *>(radix_lwe_out),
|
||||
static_cast<const uint64_t *>(radix_lwe_left),
|
||||
static_cast<const uint64_t *>(radix_lwe_right), bsks,
|
||||
static_cast<const uint64_t *>(radix_lwe_left), is_bool_left,
|
||||
static_cast<const uint64_t *>(radix_lwe_right), is_bool_right, bsks,
|
||||
(uint64_t **)(ksks), (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
|
||||
break;
|
||||
case 8192:
|
||||
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<8192>>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
static_cast<uint64_t *>(radix_lwe_out),
|
||||
static_cast<const uint64_t *>(radix_lwe_left),
|
||||
static_cast<const uint64_t *>(radix_lwe_right), bsks,
|
||||
static_cast<const uint64_t *>(radix_lwe_left), is_bool_left,
|
||||
static_cast<const uint64_t *>(radix_lwe_right), is_bool_right, bsks,
|
||||
(uint64_t **)(ksks), (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
|
||||
break;
|
||||
case 16384:
|
||||
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<16384>>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
static_cast<uint64_t *>(radix_lwe_out),
|
||||
static_cast<const uint64_t *>(radix_lwe_left),
|
||||
static_cast<const uint64_t *>(radix_lwe_right), bsks,
|
||||
static_cast<const uint64_t *>(radix_lwe_left), is_bool_left,
|
||||
static_cast<const uint64_t *>(radix_lwe_right), is_bool_right, bsks,
|
||||
(uint64_t **)(ksks), (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
|
||||
break;
|
||||
default:
|
||||
|
||||
@@ -9,6 +9,7 @@
|
||||
#include "crypto/keyswitch.cuh"
|
||||
#include "device.h"
|
||||
#include "helper_multi_gpu.h"
|
||||
#include "integer/cmux.cuh"
|
||||
#include "integer/integer.cuh"
|
||||
#include "integer/integer_utilities.h"
|
||||
#include "linear_algebra.h"
|
||||
@@ -453,7 +454,8 @@ template <typename Torus, class params>
|
||||
__host__ void host_integer_mult_radix_kb(
|
||||
cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count, uint64_t *radix_lwe_out, uint64_t const *radix_lwe_left,
|
||||
uint64_t const *radix_lwe_right, void *const *bsks, uint64_t *const *ksks,
|
||||
bool const is_bool_left, uint64_t const *radix_lwe_right,
|
||||
bool const is_bool_right, void *const *bsks, uint64_t *const *ksks,
|
||||
int_mul_memory<Torus> *mem_ptr, uint32_t num_blocks) {
|
||||
|
||||
auto glwe_dimension = mem_ptr->params.glwe_dimension;
|
||||
@@ -464,6 +466,20 @@ __host__ void host_integer_mult_radix_kb(
|
||||
|
||||
int big_lwe_dimension = glwe_dimension * polynomial_size;
|
||||
|
||||
if (is_bool_right) {
|
||||
zero_out_if<Torus>(streams, gpu_indexes, gpu_count, radix_lwe_out,
|
||||
radix_lwe_left, radix_lwe_right, mem_ptr->zero_out_mem,
|
||||
mem_ptr->zero_out_predicate_lut, bsks, ksks, num_blocks);
|
||||
return;
|
||||
}
|
||||
|
||||
if (is_bool_left) {
|
||||
zero_out_if<Torus>(streams, gpu_indexes, gpu_count, radix_lwe_out,
|
||||
radix_lwe_right, radix_lwe_left, mem_ptr->zero_out_mem,
|
||||
mem_ptr->zero_out_predicate_lut, bsks, ksks, num_blocks);
|
||||
return;
|
||||
}
|
||||
|
||||
// 'vector_result_lsb' contains blocks from all possible right shifts of
|
||||
// radix_lwe_left, only nonzero blocks are kept
|
||||
int lsb_vector_block_count = num_blocks * (num_blocks + 1) / 2;
|
||||
@@ -562,30 +578,21 @@ __host__ void host_integer_mult_radix_kb(
|
||||
terms_degree, bsks, ksks, mem_ptr->sum_ciphertexts_mem, num_blocks,
|
||||
2 * num_blocks, mem_ptr->luts_array);
|
||||
|
||||
uint32_t block_modulus = message_modulus * carry_modulus;
|
||||
uint32_t num_bits_in_block = std::log2(block_modulus);
|
||||
// if (num_blocks < num_bits_in_block) {
|
||||
// auto scp_mem_ptr = mem_ptr->sum_ciphertexts_mem->scp_mem;
|
||||
// host_propagate_single_carry<Torus>(streams, gpu_indexes, gpu_count,
|
||||
// radix_lwe_out, nullptr, nullptr,
|
||||
// scp_mem_ptr, bsks, ksks, num_blocks);
|
||||
// } else {
|
||||
auto fast_scp_mem_ptr = mem_ptr->fast_sc_prop_mem;
|
||||
uint32_t requested_flag = 0;
|
||||
uint32_t uses_carry = 0;
|
||||
host_fast_propagate_single_carry<Torus>(
|
||||
streams, gpu_indexes, gpu_count, radix_lwe_out, nullptr, nullptr,
|
||||
fast_scp_mem_ptr, bsks, ksks, num_blocks, requested_flag, uses_carry);
|
||||
//}
|
||||
auto scp_mem_ptr = mem_ptr->sum_ciphertexts_mem->scp_mem;
|
||||
host_propagate_single_carry<Torus>(streams, gpu_indexes, gpu_count,
|
||||
radix_lwe_out, nullptr, nullptr,
|
||||
scp_mem_ptr, bsks, ksks, num_blocks);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void scratch_cuda_integer_mult_radix_ciphertext_kb(
|
||||
cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count, int_mul_memory<Torus> **mem_ptr,
|
||||
bool const is_boolean_left, bool const is_boolean_right,
|
||||
uint32_t num_radix_blocks, int_radix_params params,
|
||||
bool allocate_gpu_memory) {
|
||||
*mem_ptr = new int_mul_memory<Torus>(streams, gpu_indexes, gpu_count, params,
|
||||
is_boolean_left, is_boolean_right,
|
||||
num_radix_blocks, allocate_gpu_memory);
|
||||
}
|
||||
|
||||
|
||||
@@ -38,7 +38,15 @@ void cuda_integer_radix_overflowing_sub_kb_64(
|
||||
void const *radix_lwe_right, int8_t *mem_ptr, void *const *bsks,
|
||||
void *const *ksks, uint32_t num_blocks) {
|
||||
|
||||
// auto mem = (int_overflowing_sub_memory<uint64_t> *)mem_ptr;
|
||||
auto mem = (int_overflowing_sub_memory<uint64_t> *)mem_ptr;
|
||||
|
||||
host_integer_overflowing_sub_kb<uint64_t>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
static_cast<uint64_t *>(radix_lwe_out),
|
||||
static_cast<uint64_t *>(radix_lwe_overflowed),
|
||||
static_cast<const uint64_t *>(radix_lwe_left),
|
||||
static_cast<const uint64_t *>(radix_lwe_right), bsks, (uint64_t **)(ksks),
|
||||
mem, num_blocks);
|
||||
}
|
||||
|
||||
void cleanup_cuda_integer_radix_overflowing_sub(void *const *streams,
|
||||
|
||||
@@ -91,7 +91,7 @@ __host__ void scratch_cuda_integer_overflowing_sub_kb(
|
||||
*mem_ptr = new int_overflowing_sub_memory<Torus>(
|
||||
streams, gpu_indexes, gpu_count, params, num_blocks, allocate_gpu_memory);
|
||||
}
|
||||
/*
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void host_integer_overflowing_sub_kb(
|
||||
cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
@@ -113,39 +113,4 @@ __host__ void host_integer_overflowing_sub_kb(
|
||||
mem_ptr, bsks, ksks, num_blocks);
|
||||
}
|
||||
|
||||
*/
|
||||
template <typename Torus>
|
||||
__host__ void host_integer_overflowing_sub(
|
||||
cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count, Torus *lwe_out_array, Torus *lhs_array,
|
||||
const Torus *rhs_array, Torus *overflow_block, const Torus *input_borrow,
|
||||
int_fast_borrow_prop_memory<uint64_t> *mem_ptr, void *const *bsks,
|
||||
Torus *const *ksks, uint32_t num_blocks, uint32_t compute_overflow,
|
||||
uint32_t uses_input_borrow) {
|
||||
|
||||
auto radix_params = mem_ptr->params;
|
||||
|
||||
// We need to recalculate the num_groups, because on the division the number
|
||||
// of num_blocks changes
|
||||
uint32_t block_modulus =
|
||||
radix_params.message_modulus * radix_params.carry_modulus;
|
||||
uint32_t num_bits_in_block = std::log2(block_modulus);
|
||||
uint32_t grouping_size = num_bits_in_block;
|
||||
uint32_t num_groups = (num_blocks + grouping_size - 1) / grouping_size;
|
||||
|
||||
auto stream = (cudaStream_t *)streams;
|
||||
host_unchecked_sub_with_correcting_term<Torus>(
|
||||
stream[0], gpu_indexes[0], static_cast<Torus *>(lwe_out_array),
|
||||
static_cast<Torus *>(lhs_array), static_cast<const Torus *>(rhs_array),
|
||||
radix_params.big_lwe_dimension, num_blocks, radix_params.message_modulus,
|
||||
radix_params.carry_modulus, radix_params.message_modulus - 1);
|
||||
|
||||
host_fast_borrow_propagate<Torus>(
|
||||
streams, gpu_indexes, gpu_count, static_cast<Torus *>(lwe_out_array),
|
||||
static_cast<Torus *>(overflow_block),
|
||||
static_cast<const Torus *>(input_borrow),
|
||||
(int_fast_borrow_prop_memory<Torus> *)mem_ptr, bsks, (Torus **)(ksks),
|
||||
num_blocks, num_groups, compute_overflow, uses_input_borrow);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
@@ -112,24 +112,10 @@ __host__ void host_integer_scalar_mul_radix(
|
||||
terms_degree, bsks, ksks, mem->sum_ciphertexts_vec_mem,
|
||||
num_radix_blocks, j, nullptr);
|
||||
|
||||
// uint32_t carry_modulus = message_modulus;
|
||||
// uint32_t block_modulus = message_modulus * carry_modulus;
|
||||
// uint32_t num_bits_in_block = std::log2(block_modulus);
|
||||
// if (num_radix_blocks < num_bits_in_block) {
|
||||
// auto scp_mem_ptr = mem->sum_ciphertexts_vec_mem->scp_mem;
|
||||
// host_propagate_single_carry<T>(streams, gpu_indexes, gpu_count,
|
||||
// lwe_array,
|
||||
// nullptr, nullptr, scp_mem_ptr, bsks,
|
||||
// ksks, num_radix_blocks);
|
||||
// } else {
|
||||
auto fast_scp_mem_ptr = mem->fast_sc_prop_mem;
|
||||
uint32_t requested_flag = 0;
|
||||
uint32_t uses_carry = 0;
|
||||
host_fast_propagate_single_carry<T>(
|
||||
streams, gpu_indexes, gpu_count, lwe_array, nullptr, nullptr,
|
||||
fast_scp_mem_ptr, bsks, ksks, num_radix_blocks, requested_flag,
|
||||
uses_carry);
|
||||
//}
|
||||
auto scp_mem_ptr = mem->sum_ciphertexts_vec_mem->scp_mem;
|
||||
host_propagate_single_carry<T>(streams, gpu_indexes, gpu_count, lwe_array,
|
||||
nullptr, nullptr, scp_mem_ptr, bsks, ksks,
|
||||
num_radix_blocks);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -57,27 +57,6 @@ void cuda_add_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index,
|
||||
static_cast<const uint64_t *>(lwe_array_in_2),
|
||||
input_lwe_dimension, input_lwe_ciphertext_count);
|
||||
}
|
||||
// last block it is the packing lhs*message_modulus + rhs
|
||||
void cuda_add_lwe_ciphertext_vector_64_with_packing(
|
||||
void *stream, uint32_t gpu_index, void *lwe_array_out,
|
||||
void const *lwe_array_in_1, void const *lwe_array_in_2,
|
||||
uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count,
|
||||
uint32_t message_modulus) {
|
||||
|
||||
host_addition<uint64_t>(static_cast<cudaStream_t>(stream), gpu_index,
|
||||
static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<const uint64_t *>(lwe_array_in_1),
|
||||
static_cast<const uint64_t *>(lwe_array_in_2),
|
||||
input_lwe_dimension, input_lwe_ciphertext_count - 1);
|
||||
|
||||
host_pack_for_overflowing_ops<uint64_t>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index,
|
||||
static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<const uint64_t *>(lwe_array_in_1),
|
||||
static_cast<const uint64_t *>(lwe_array_in_2), input_lwe_dimension,
|
||||
input_lwe_ciphertext_count, message_modulus);
|
||||
}
|
||||
|
||||
/*
|
||||
* Perform the addition of a u32 input LWE ciphertext vector with a u32
|
||||
* plaintext vector. See the equivalent operation on u64 data for more details.
|
||||
|
||||
@@ -82,45 +82,6 @@ __host__ void host_addition(cudaStream_t stream, uint32_t gpu_index, T *output,
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__global__ void pack_for_overflowing_ops(T *output, T const *input_1,
|
||||
T const *input_2, uint32_t num_entries,
|
||||
uint32_t message_modulus) {
|
||||
|
||||
int tid = threadIdx.x;
|
||||
int index = blockIdx.x * blockDim.x + tid;
|
||||
if (index < num_entries) {
|
||||
// Here we take advantage of the wrapping behaviour of uint
|
||||
output[index] = input_1[index] * message_modulus + input_2[index];
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__host__ void host_pack_for_overflowing_ops(cudaStream_t stream,
|
||||
uint32_t gpu_index, T *output,
|
||||
T const *input_1, T const *input_2,
|
||||
uint32_t input_lwe_dimension,
|
||||
uint32_t input_lwe_ciphertext_count,
|
||||
uint32_t message_modulus) {
|
||||
|
||||
cudaSetDevice(gpu_index);
|
||||
// lwe_size includes the presence of the body
|
||||
// whereas lwe_dimension is the number of elements in the mask
|
||||
int lwe_size = input_lwe_dimension + 1;
|
||||
// Create a 1-dimensional grid of threads
|
||||
int num_blocks = 0, num_threads = 0;
|
||||
int num_entries = lwe_size;
|
||||
getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
|
||||
dim3 grid(num_blocks, 1, 1);
|
||||
dim3 thds(num_threads, 1, 1);
|
||||
|
||||
pack_for_overflowing_ops<T><<<grid, thds, 0, stream>>>(
|
||||
&output[input_lwe_ciphertext_count - 1],
|
||||
&input_1[input_lwe_ciphertext_count - 1],
|
||||
&input_2[input_lwe_ciphertext_count - 1], lwe_size, message_modulus);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__global__ void subtraction(T *output, T const *input_1, T const *input_2,
|
||||
uint32_t num_entries) {
|
||||
|
||||
@@ -24,8 +24,8 @@ __device__ const T *get_ith_mask_kth_block(const T *ptr, int i, int k,
|
||||
uint32_t level_count) {
|
||||
return &ptr[get_start_ith_ggsw(i, polynomial_size, glwe_dimension,
|
||||
level_count) +
|
||||
level * polynomial_size / 2 * (glwe_dimension + 1) *
|
||||
(glwe_dimension + 1) +
|
||||
(level_count - level - 1) * polynomial_size / 2 *
|
||||
(glwe_dimension + 1) * (glwe_dimension + 1) +
|
||||
k * polynomial_size / 2 * (glwe_dimension + 1)];
|
||||
}
|
||||
|
||||
@@ -35,8 +35,8 @@ __device__ T *get_ith_mask_kth_block(T *ptr, int i, int k, int level,
|
||||
int glwe_dimension, uint32_t level_count) {
|
||||
return &ptr[get_start_ith_ggsw(i, polynomial_size, glwe_dimension,
|
||||
level_count) +
|
||||
level * polynomial_size / 2 * (glwe_dimension + 1) *
|
||||
(glwe_dimension + 1) +
|
||||
(level_count - level - 1) * polynomial_size / 2 *
|
||||
(glwe_dimension + 1) * (glwe_dimension + 1) +
|
||||
k * polynomial_size / 2 * (glwe_dimension + 1)];
|
||||
}
|
||||
template <typename T>
|
||||
@@ -45,8 +45,8 @@ __device__ T *get_ith_body_kth_block(T *ptr, int i, int k, int level,
|
||||
int glwe_dimension, uint32_t level_count) {
|
||||
return &ptr[get_start_ith_ggsw(i, polynomial_size, glwe_dimension,
|
||||
level_count) +
|
||||
level * polynomial_size / 2 * (glwe_dimension + 1) *
|
||||
(glwe_dimension + 1) +
|
||||
(level_count - level - 1) * polynomial_size / 2 *
|
||||
(glwe_dimension + 1) * (glwe_dimension + 1) +
|
||||
k * polynomial_size / 2 * (glwe_dimension + 1) +
|
||||
glwe_dimension * polynomial_size / 2];
|
||||
}
|
||||
|
||||
@@ -7,6 +7,7 @@
|
||||
#include "fft/bnsmfft.cuh"
|
||||
#include "helper_multi_gpu.h"
|
||||
#include "pbs/programmable_bootstrap_multibit.h"
|
||||
#include "polynomial/polynomial_math.cuh"
|
||||
|
||||
using namespace cooperative_groups;
|
||||
namespace cg = cooperative_groups;
|
||||
@@ -20,59 +21,43 @@ get_join_buffer_element(int level_id, int glwe_id, G &group,
|
||||
double2 *global_memory_buffer, uint32_t polynomial_size,
|
||||
uint32_t glwe_dimension, bool support_dsm);
|
||||
|
||||
template <typename Torus, typename G, class params>
|
||||
/** Perform the matrix multiplication between the GGSW and the GLWE,
|
||||
* each block operating on a single level for mask and body.
|
||||
* Both operands should be at fourier domain
|
||||
*
|
||||
* This function assumes:
|
||||
* - Thread blocks at dimension x relates to the decomposition level.
|
||||
* - Thread blocks at dimension y relates to the glwe dimension.
|
||||
* - polynomial_size / params::opt threads are available per block
|
||||
*/
|
||||
template <typename G, class params>
|
||||
__device__ void
|
||||
mul_ggsw_glwe(Torus *accumulator, double2 *fft, double2 *join_buffer,
|
||||
const double2 *__restrict__ bootstrapping_key,
|
||||
int polynomial_size, uint32_t glwe_dimension, int level_count,
|
||||
int iteration, G &group, bool support_dsm = false) {
|
||||
|
||||
// Switch to the FFT space
|
||||
NSMFFT_direct<HalfDegree<params>>(fft);
|
||||
synchronize_threads_in_block();
|
||||
|
||||
// Get the pieces of the bootstrapping key that will be needed for the
|
||||
// external product; blockIdx.x is the ID of the block that's executing
|
||||
// this function, so we end up getting the lines of the bootstrapping key
|
||||
// needed to perform the external product in this block (corresponding to
|
||||
// the same decomposition level)
|
||||
auto bsk_slice = get_ith_mask_kth_block(
|
||||
bootstrapping_key, iteration, blockIdx.y, blockIdx.x, polynomial_size,
|
||||
glwe_dimension, level_count);
|
||||
|
||||
// Perform the matrix multiplication between the GGSW and the GLWE,
|
||||
// each block operating on a single level for mask and body
|
||||
mul_ggsw_glwe_in_fourier_domain(double2 *fft, double2 *join_buffer,
|
||||
const double2 *__restrict__ bootstrapping_key,
|
||||
int iteration, G &group,
|
||||
bool support_dsm = false) {
|
||||
const uint32_t polynomial_size = params::degree;
|
||||
const uint32_t glwe_dimension = gridDim.y - 1;
|
||||
const uint32_t level_count = gridDim.x;
|
||||
|
||||
// The first product is used to initialize level_join_buffer
|
||||
auto bsk_poly = bsk_slice + blockIdx.y * params::degree / 2;
|
||||
auto this_block_rank = get_this_block_rank<G>(group, support_dsm);
|
||||
auto buffer_slice =
|
||||
get_join_buffer_element<G>(blockIdx.x, blockIdx.y, group, join_buffer,
|
||||
polynomial_size, glwe_dimension, support_dsm);
|
||||
|
||||
int tid = threadIdx.x;
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
buffer_slice[tid] = fft[tid] * bsk_poly[tid];
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
|
||||
group.sync();
|
||||
|
||||
// Continues multiplying fft by every polynomial in that particular bsk level
|
||||
// Each y-block accumulates in a different polynomial at each iteration
|
||||
for (int j = 1; j < (glwe_dimension + 1); j++) {
|
||||
auto bsk_slice = get_ith_mask_kth_block(
|
||||
bootstrapping_key, iteration, blockIdx.y, blockIdx.x, polynomial_size,
|
||||
glwe_dimension, level_count);
|
||||
for (int j = 0; j < glwe_dimension + 1; j++) {
|
||||
int idx = (j + this_block_rank) % (glwe_dimension + 1);
|
||||
|
||||
auto bsk_poly = bsk_slice + idx * params::degree / 2;
|
||||
auto bsk_poly = bsk_slice + idx * polynomial_size / 2;
|
||||
auto buffer_slice = get_join_buffer_element<G>(blockIdx.x, idx, group,
|
||||
join_buffer, polynomial_size,
|
||||
glwe_dimension, support_dsm);
|
||||
|
||||
int tid = threadIdx.x;
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
buffer_slice[tid] += fft[tid] * bsk_poly[tid];
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
polynomial_product_accumulate_in_fourier_domain<params, double2>(
|
||||
buffer_slice, fft, bsk_poly, j == 0);
|
||||
group.sync();
|
||||
}
|
||||
|
||||
@@ -80,40 +65,16 @@ mul_ggsw_glwe(Torus *accumulator, double2 *fft, double2 *join_buffer,
|
||||
// All blocks are synchronized here; after this sync, level_join_buffer has
|
||||
// the values needed from every other block
|
||||
|
||||
auto src_acc =
|
||||
get_join_buffer_element<G>(0, blockIdx.y, group, join_buffer,
|
||||
polynomial_size, glwe_dimension, support_dsm);
|
||||
|
||||
// copy first product into fft buffer
|
||||
tid = threadIdx.x;
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
fft[tid] = src_acc[tid];
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
synchronize_threads_in_block();
|
||||
|
||||
// accumulate rest of the products into fft buffer
|
||||
for (int l = 1; l < gridDim.x; l++) {
|
||||
for (int l = 0; l < level_count; l++) {
|
||||
auto cur_src_acc = get_join_buffer_element<G>(l, blockIdx.y, group,
|
||||
join_buffer, polynomial_size,
|
||||
glwe_dimension, support_dsm);
|
||||
tid = threadIdx.x;
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
fft[tid] += cur_src_acc[tid];
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
|
||||
polynomial_accumulate_in_fourier_domain<params>(fft, cur_src_acc, l == 0);
|
||||
}
|
||||
|
||||
synchronize_threads_in_block();
|
||||
|
||||
// Perform the inverse FFT on the result of the GGSW x GLWE and add to the
|
||||
// accumulator
|
||||
NSMFFT_inverse<HalfDegree<params>>(fft);
|
||||
synchronize_threads_in_block();
|
||||
|
||||
add_to_torus<Torus, params>(fft, accumulator);
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
|
||||
@@ -117,8 +117,8 @@ __global__ void device_programmable_bootstrap_amortized(
|
||||
|
||||
// Perform a rounding to increase the accuracy of the
|
||||
// bootstrapped ciphertext
|
||||
round_to_closest_multiple_inplace<Torus, params::opt,
|
||||
params::degree / params::opt>(
|
||||
init_decomposer_state_inplace<Torus, params::opt,
|
||||
params::degree / params::opt>(
|
||||
accumulator_rotated, base_log, level_count, glwe_dimension + 1);
|
||||
|
||||
// Initialize the polynomial multiplication via FFT arrays
|
||||
|
||||
@@ -117,8 +117,8 @@ __global__ void device_programmable_bootstrap_cg(
|
||||
|
||||
// Perform a rounding to increase the accuracy of the
|
||||
// bootstrapped ciphertext
|
||||
round_to_closest_multiple_inplace<Torus, params::opt,
|
||||
params::degree / params::opt>(
|
||||
init_decomposer_state_inplace<Torus, params::opt,
|
||||
params::degree / params::opt>(
|
||||
accumulator_rotated, base_log, level_count);
|
||||
|
||||
synchronize_threads_in_block();
|
||||
@@ -129,18 +129,16 @@ __global__ void device_programmable_bootstrap_cg(
|
||||
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count,
|
||||
accumulator_rotated);
|
||||
gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x);
|
||||
|
||||
// We are using the same memory space for accumulator_fft and
|
||||
// accumulator_rotated, so we need to synchronize here to make sure they
|
||||
// don't modify the same memory space at the same time
|
||||
NSMFFT_direct<HalfDegree<params>>(accumulator_fft);
|
||||
synchronize_threads_in_block();
|
||||
|
||||
// Perform G^-1(ACC) * GGSW -> GLWE
|
||||
mul_ggsw_glwe<Torus, grid_group, params>(
|
||||
accumulator, accumulator_fft, block_join_buffer, bootstrapping_key,
|
||||
polynomial_size, glwe_dimension, level_count, i, grid);
|
||||
|
||||
mul_ggsw_glwe_in_fourier_domain<grid_group, params>(
|
||||
accumulator_fft, block_join_buffer, bootstrapping_key, i, grid);
|
||||
NSMFFT_inverse<HalfDegree<params>>(accumulator_fft);
|
||||
synchronize_threads_in_block();
|
||||
|
||||
add_to_torus<Torus, params>(accumulator_fft, accumulator);
|
||||
}
|
||||
|
||||
auto block_lwe_array_out =
|
||||
@@ -148,40 +146,42 @@ __global__ void device_programmable_bootstrap_cg(
|
||||
(glwe_dimension * polynomial_size + 1) +
|
||||
blockIdx.y * polynomial_size];
|
||||
|
||||
if (blockIdx.x == 0 && blockIdx.y < glwe_dimension) {
|
||||
// Perform a sample extract. At this point, all blocks have the result, but
|
||||
// we do the computation at block 0 to avoid waiting for extra blocks, in
|
||||
// case they're not synchronized
|
||||
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
|
||||
if (lut_count > 1) {
|
||||
for (int i = 1; i < lut_count; i++) {
|
||||
auto next_lwe_array_out =
|
||||
lwe_array_out +
|
||||
(i * gridDim.z * (glwe_dimension * polynomial_size + 1));
|
||||
auto next_block_lwe_array_out =
|
||||
&next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
|
||||
(glwe_dimension * polynomial_size + 1) +
|
||||
blockIdx.y * polynomial_size];
|
||||
if (blockIdx.x == 0) {
|
||||
if (blockIdx.y < glwe_dimension) {
|
||||
// Perform a sample extract. At this point, all blocks have the result,
|
||||
// but we do the computation at block 0 to avoid waiting for extra blocks,
|
||||
// in case they're not synchronized
|
||||
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
|
||||
if (lut_count > 1) {
|
||||
for (int i = 1; i < lut_count; i++) {
|
||||
auto next_lwe_array_out =
|
||||
lwe_array_out +
|
||||
(i * gridDim.z * (glwe_dimension * polynomial_size + 1));
|
||||
auto next_block_lwe_array_out =
|
||||
&next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
|
||||
(glwe_dimension * polynomial_size + 1) +
|
||||
blockIdx.y * polynomial_size];
|
||||
|
||||
sample_extract_mask<Torus, params>(next_block_lwe_array_out,
|
||||
accumulator, 1, i * lut_stride);
|
||||
sample_extract_mask<Torus, params>(next_block_lwe_array_out,
|
||||
accumulator, 1, i * lut_stride);
|
||||
}
|
||||
}
|
||||
}
|
||||
} else if (blockIdx.x == 0 && blockIdx.y == glwe_dimension) {
|
||||
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
|
||||
if (lut_count > 1) {
|
||||
for (int i = 1; i < lut_count; i++) {
|
||||
} else if (blockIdx.y == glwe_dimension) {
|
||||
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
|
||||
if (lut_count > 1) {
|
||||
for (int i = 1; i < lut_count; i++) {
|
||||
|
||||
auto next_lwe_array_out =
|
||||
lwe_array_out +
|
||||
(i * gridDim.z * (glwe_dimension * polynomial_size + 1));
|
||||
auto next_block_lwe_array_out =
|
||||
&next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
|
||||
(glwe_dimension * polynomial_size + 1) +
|
||||
blockIdx.y * polynomial_size];
|
||||
auto next_lwe_array_out =
|
||||
lwe_array_out +
|
||||
(i * gridDim.z * (glwe_dimension * polynomial_size + 1));
|
||||
auto next_block_lwe_array_out =
|
||||
&next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
|
||||
(glwe_dimension * polynomial_size + 1) +
|
||||
blockIdx.y * polynomial_size];
|
||||
|
||||
sample_extract_body<Torus, params>(next_block_lwe_array_out,
|
||||
accumulator, 0, i * lut_stride);
|
||||
sample_extract_body<Torus, params>(next_block_lwe_array_out,
|
||||
accumulator, 0, i * lut_stride);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -254,7 +254,7 @@ __host__ void host_programmable_bootstrap_cg(
|
||||
uint64_t partial_dm = full_dm - partial_sm;
|
||||
|
||||
int8_t *d_mem = buffer->d_mem;
|
||||
double2 *buffer_fft = buffer->global_accumulator_fft;
|
||||
double2 *buffer_fft = buffer->global_join_buffer;
|
||||
|
||||
int thds = polynomial_size / params::opt;
|
||||
dim3 grid(level_count, glwe_dimension + 1, input_lwe_ciphertext_count);
|
||||
|
||||
@@ -33,7 +33,6 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
uint32_t lwe_chunk_size, uint32_t keybundle_size_per_input,
|
||||
int8_t *device_mem, uint64_t device_memory_size_per_block,
|
||||
uint32_t lut_count, uint32_t lut_stride) {
|
||||
|
||||
grid_group grid = this_grid();
|
||||
|
||||
// We use shared memory for the polynomials that are used often during the
|
||||
@@ -50,9 +49,9 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
selected_memory = &device_mem[block_index * device_memory_size_per_block];
|
||||
}
|
||||
|
||||
Torus *accumulator = (Torus *)selected_memory;
|
||||
Torus *accumulator_rotated = (Torus *)selected_memory;
|
||||
double2 *accumulator_fft =
|
||||
(double2 *)accumulator +
|
||||
(double2 *)accumulator_rotated +
|
||||
(ptrdiff_t)(sizeof(Torus) * polynomial_size / sizeof(double2));
|
||||
|
||||
if constexpr (SMD == PARTIALSM)
|
||||
@@ -71,13 +70,12 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
&join_buffer[blockIdx.z * level_count * (glwe_dimension + 1) *
|
||||
params::degree / 2];
|
||||
|
||||
Torus *global_slice =
|
||||
global_accumulator +
|
||||
(blockIdx.y + blockIdx.z * (glwe_dimension + 1)) * params::degree;
|
||||
Torus *global_accumulator_slice =
|
||||
&global_accumulator[(blockIdx.y + blockIdx.z * (glwe_dimension + 1)) *
|
||||
params::degree];
|
||||
|
||||
const double2 *keybundle = keybundle_array +
|
||||
// select the input
|
||||
blockIdx.z * keybundle_size_per_input;
|
||||
const double2 *keybundle =
|
||||
&keybundle_array[blockIdx.z * keybundle_size_per_input];
|
||||
|
||||
if (lwe_offset == 0) {
|
||||
// Put "b" in [0, 2N[
|
||||
@@ -87,92 +85,95 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
|
||||
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
|
||||
params::degree / params::opt>(
|
||||
accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
|
||||
false);
|
||||
accumulator_rotated, &block_lut_vector[blockIdx.y * params::degree],
|
||||
b_hat, false);
|
||||
} else {
|
||||
// Load the accumulator calculated in previous iterations
|
||||
// Load the accumulator_rotated calculated in previous iterations
|
||||
copy_polynomial<Torus, params::opt, params::degree / params::opt>(
|
||||
global_slice, accumulator);
|
||||
global_accumulator_slice, accumulator_rotated);
|
||||
}
|
||||
|
||||
for (int i = 0; (i + lwe_offset) < lwe_dimension && i < lwe_chunk_size; i++) {
|
||||
// Perform a rounding to increase the accuracy of the
|
||||
// bootstrapped ciphertext
|
||||
round_to_closest_multiple_inplace<Torus, params::opt,
|
||||
params::degree / params::opt>(
|
||||
accumulator, base_log, level_count);
|
||||
init_decomposer_state_inplace<Torus, params::opt,
|
||||
params::degree / params::opt>(
|
||||
accumulator_rotated, base_log, level_count);
|
||||
|
||||
// Decompose the accumulator. Each block gets one level of the
|
||||
// Decompose the accumulator_rotated. Each block gets one level of the
|
||||
// decomposition, for the mask and the body (so block 0 will have the
|
||||
// accumulator decomposed at level 0, 1 at 1, etc.)
|
||||
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator);
|
||||
// accumulator_rotated decomposed at level 0, 1 at 1, etc.)
|
||||
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count,
|
||||
accumulator_rotated);
|
||||
gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x);
|
||||
|
||||
// We are using the same memory space for accumulator_fft and
|
||||
// accumulator_rotated, so we need to synchronize here to make sure they
|
||||
// don't modify the same memory space at the same time
|
||||
NSMFFT_direct<HalfDegree<params>>(accumulator_fft);
|
||||
synchronize_threads_in_block();
|
||||
|
||||
// Perform G^-1(ACC) * GGSW -> GLWE
|
||||
mul_ggsw_glwe<Torus, grid_group, params>(
|
||||
accumulator, accumulator_fft, block_join_buffer, keybundle,
|
||||
polynomial_size, glwe_dimension, level_count, i, grid);
|
||||
|
||||
mul_ggsw_glwe_in_fourier_domain<grid_group, params>(
|
||||
accumulator_fft, block_join_buffer, keybundle, i, grid);
|
||||
NSMFFT_inverse<HalfDegree<params>>(accumulator_fft);
|
||||
synchronize_threads_in_block();
|
||||
|
||||
add_to_torus<Torus, params>(accumulator_fft, accumulator_rotated, true);
|
||||
}
|
||||
|
||||
if (lwe_offset + lwe_chunk_size >= (lwe_dimension / grouping_factor)) {
|
||||
auto block_lwe_array_out =
|
||||
&lwe_array_out[lwe_output_indexes[blockIdx.z] *
|
||||
(glwe_dimension * polynomial_size + 1) +
|
||||
blockIdx.y * polynomial_size];
|
||||
auto accumulator = accumulator_rotated;
|
||||
|
||||
if (blockIdx.x == 0 && blockIdx.y < glwe_dimension) {
|
||||
// Perform a sample extract. At this point, all blocks have the result,
|
||||
// but we do the computation at block 0 to avoid waiting for extra blocks,
|
||||
// in case they're not synchronized
|
||||
// Always extract one by default
|
||||
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
|
||||
if (blockIdx.x == 0) {
|
||||
if (lwe_offset + lwe_chunk_size >= (lwe_dimension / grouping_factor)) {
|
||||
auto block_lwe_array_out =
|
||||
&lwe_array_out[lwe_output_indexes[blockIdx.z] *
|
||||
(glwe_dimension * polynomial_size + 1) +
|
||||
blockIdx.y * polynomial_size];
|
||||
|
||||
if (lut_count > 1) {
|
||||
for (int i = 1; i < lut_count; i++) {
|
||||
auto next_lwe_array_out =
|
||||
lwe_array_out +
|
||||
(i * gridDim.z * (glwe_dimension * polynomial_size + 1));
|
||||
auto next_block_lwe_array_out =
|
||||
&next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
|
||||
(glwe_dimension * polynomial_size + 1) +
|
||||
blockIdx.y * polynomial_size];
|
||||
if (blockIdx.y < glwe_dimension) {
|
||||
// Perform a sample extract. At this point, all blocks have the result,
|
||||
// but we do the computation at block 0 to avoid waiting for extra
|
||||
// blocks, in case they're not synchronized Always extract one by
|
||||
// default
|
||||
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
|
||||
|
||||
sample_extract_mask<Torus, params>(next_block_lwe_array_out,
|
||||
accumulator, 1, i * lut_stride);
|
||||
}
|
||||
}
|
||||
|
||||
} else if (blockIdx.x == 0 && blockIdx.y == glwe_dimension) {
|
||||
|
||||
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
|
||||
|
||||
if (lut_count > 1) {
|
||||
for (int i = 1; i < lut_count; i++) {
|
||||
|
||||
auto next_lwe_array_out =
|
||||
lwe_array_out +
|
||||
(i * gridDim.z * (glwe_dimension * polynomial_size + 1));
|
||||
auto next_block_lwe_array_out =
|
||||
&next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
|
||||
(glwe_dimension * polynomial_size + 1) +
|
||||
blockIdx.y * polynomial_size];
|
||||
|
||||
sample_extract_body<Torus, params>(next_block_lwe_array_out,
|
||||
accumulator, 0, i * lut_stride);
|
||||
if (lut_count > 1) {
|
||||
for (int i = 1; i < lut_count; i++) {
|
||||
auto next_lwe_array_out =
|
||||
lwe_array_out +
|
||||
(i * gridDim.z * (glwe_dimension * polynomial_size + 1));
|
||||
auto next_block_lwe_array_out =
|
||||
&next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
|
||||
(glwe_dimension * polynomial_size + 1) +
|
||||
blockIdx.y * polynomial_size];
|
||||
|
||||
sample_extract_mask<Torus, params>(next_block_lwe_array_out,
|
||||
accumulator, 1, i * lut_stride);
|
||||
}
|
||||
}
|
||||
|
||||
} else if (blockIdx.y == glwe_dimension) {
|
||||
|
||||
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
|
||||
|
||||
if (lut_count > 1) {
|
||||
for (int i = 1; i < lut_count; i++) {
|
||||
|
||||
auto next_lwe_array_out =
|
||||
lwe_array_out +
|
||||
(i * gridDim.z * (glwe_dimension * polynomial_size + 1));
|
||||
auto next_block_lwe_array_out =
|
||||
&next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
|
||||
(glwe_dimension * polynomial_size + 1) +
|
||||
blockIdx.y * polynomial_size];
|
||||
|
||||
sample_extract_body<Torus, params>(next_block_lwe_array_out,
|
||||
accumulator, 0, i * lut_stride);
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Load the accumulator calculated in previous iterations
|
||||
copy_polynomial<Torus, params::opt, params::degree / params::opt>(
|
||||
accumulator, global_accumulator_slice);
|
||||
}
|
||||
} else {
|
||||
// Load the accumulator calculated in previous iterations
|
||||
copy_polynomial<Torus, params::opt, params::degree / params::opt>(
|
||||
accumulator, global_slice);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -295,15 +296,18 @@ __host__ void execute_cg_external_product_loop(
|
||||
uint32_t level_count, uint32_t lwe_offset, uint32_t lut_count,
|
||||
uint32_t lut_stride) {
|
||||
|
||||
auto lwe_chunk_size = buffer->lwe_chunk_size;
|
||||
uint64_t full_dm =
|
||||
uint64_t full_sm =
|
||||
get_buffer_size_full_sm_cg_multibit_programmable_bootstrap<Torus>(
|
||||
polynomial_size);
|
||||
uint64_t partial_dm =
|
||||
uint64_t partial_sm =
|
||||
get_buffer_size_partial_sm_cg_multibit_programmable_bootstrap<Torus>(
|
||||
polynomial_size);
|
||||
|
||||
auto full_dm = full_sm;
|
||||
auto partial_dm = full_sm - partial_sm;
|
||||
uint64_t no_dm = 0;
|
||||
|
||||
auto lwe_chunk_size = buffer->lwe_chunk_size;
|
||||
int max_shared_memory = cuda_get_max_shared_memory(0);
|
||||
cudaSetDevice(gpu_index);
|
||||
|
||||
@@ -313,13 +317,11 @@ __host__ void execute_cg_external_product_loop(
|
||||
|
||||
uint32_t chunk_size =
|
||||
std::min(lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset);
|
||||
if (chunk_size == 0)
|
||||
return;
|
||||
|
||||
auto d_mem = buffer->d_mem_acc_cg;
|
||||
auto keybundle_fft = buffer->keybundle_fft;
|
||||
auto global_accumulator = buffer->global_accumulator;
|
||||
auto buffer_fft = buffer->global_accumulator_fft;
|
||||
auto join_buffer = buffer->global_join_buffer;
|
||||
|
||||
void *kernel_args[22];
|
||||
kernel_args[0] = &lwe_array_out;
|
||||
@@ -329,7 +331,7 @@ __host__ void execute_cg_external_product_loop(
|
||||
kernel_args[4] = &lwe_array_in;
|
||||
kernel_args[5] = &lwe_input_indexes;
|
||||
kernel_args[6] = &keybundle_fft;
|
||||
kernel_args[7] = &buffer_fft;
|
||||
kernel_args[7] = &join_buffer;
|
||||
kernel_args[8] = &global_accumulator;
|
||||
kernel_args[9] = &lwe_dimension;
|
||||
kernel_args[10] = &glwe_dimension;
|
||||
@@ -358,13 +360,13 @@ __host__ void execute_cg_external_product_loop(
|
||||
check_cuda_error(cudaLaunchCooperativeKernel(
|
||||
(void *)device_multi_bit_programmable_bootstrap_cg_accumulate<
|
||||
Torus, params, PARTIALSM>,
|
||||
grid_accumulate, thds, (void **)kernel_args, partial_dm, stream));
|
||||
grid_accumulate, thds, (void **)kernel_args, partial_sm, stream));
|
||||
} else {
|
||||
kernel_args[19] = &no_dm;
|
||||
check_cuda_error(cudaLaunchCooperativeKernel(
|
||||
(void *)device_multi_bit_programmable_bootstrap_cg_accumulate<
|
||||
Torus, params, FULLSM>,
|
||||
grid_accumulate, thds, (void **)kernel_args, full_dm, stream));
|
||||
grid_accumulate, thds, (void **)kernel_args, full_sm, stream));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -654,8 +654,8 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
|
||||
int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
|
||||
uint32_t num_samples, uint32_t lut_count, uint32_t lut_stride) {
|
||||
if (base_log > 32)
|
||||
PANIC("Cuda error (classical PBS): base log should be <= 32")
|
||||
if (base_log > 64)
|
||||
PANIC("Cuda error (classical PBS): base log should be <= 64")
|
||||
|
||||
pbs_buffer<uint64_t, CLASSICAL> *buffer =
|
||||
(pbs_buffer<uint64_t, CLASSICAL> *)mem_ptr;
|
||||
|
||||
@@ -25,7 +25,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
const Torus *__restrict__ lwe_array_in,
|
||||
const Torus *__restrict__ lwe_input_indexes,
|
||||
const double2 *__restrict__ bootstrapping_key,
|
||||
Torus *global_accumulator, double2 *global_accumulator_fft,
|
||||
Torus *global_accumulator, double2 *global_join_buffer,
|
||||
uint32_t lwe_iteration, uint32_t lwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
|
||||
int8_t *device_mem, uint64_t device_memory_size_per_block) {
|
||||
@@ -67,10 +67,9 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
(blockIdx.y + blockIdx.z * (glwe_dimension + 1)) * params::degree;
|
||||
|
||||
double2 *global_fft_slice =
|
||||
global_accumulator_fft +
|
||||
(blockIdx.y + blockIdx.x * (glwe_dimension + 1) +
|
||||
blockIdx.z * level_count * (glwe_dimension + 1)) *
|
||||
(polynomial_size / 2);
|
||||
global_join_buffer + (blockIdx.y + blockIdx.x * (glwe_dimension + 1) +
|
||||
blockIdx.z * level_count * (glwe_dimension + 1)) *
|
||||
(polynomial_size / 2);
|
||||
|
||||
if (lwe_iteration == 0) {
|
||||
// First iteration
|
||||
@@ -107,8 +106,8 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
|
||||
// Perform a rounding to increase the accuracy of the
|
||||
// bootstrapped ciphertext
|
||||
round_to_closest_multiple_inplace<Torus, params::opt,
|
||||
params::degree / params::opt>(
|
||||
init_decomposer_state_inplace<Torus, params::opt,
|
||||
params::degree / params::opt>(
|
||||
accumulator, base_log, level_count);
|
||||
|
||||
synchronize_threads_in_block();
|
||||
@@ -139,7 +138,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
const Torus *__restrict__ lut_vector,
|
||||
const Torus *__restrict__ lut_vector_indexes,
|
||||
const double2 *__restrict__ bootstrapping_key,
|
||||
Torus *global_accumulator, double2 *global_accumulator_fft,
|
||||
Torus *global_accumulator, double2 *global_join_buffer,
|
||||
uint32_t lwe_iteration, uint32_t lwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
|
||||
int8_t *device_mem, uint64_t device_memory_size_per_block,
|
||||
@@ -171,9 +170,9 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
accumulator_fft = (double2 *)sharedmem;
|
||||
|
||||
for (int level = 0; level < level_count; level++) {
|
||||
double2 *global_fft_slice = global_accumulator_fft +
|
||||
(level + blockIdx.x * level_count) *
|
||||
(glwe_dimension + 1) * (params::degree / 2);
|
||||
double2 *global_fft_slice =
|
||||
global_join_buffer + (level + blockIdx.x * level_count) *
|
||||
(glwe_dimension + 1) * (params::degree / 2);
|
||||
|
||||
for (int j = 0; j < (glwe_dimension + 1); j++) {
|
||||
double2 *fft = global_fft_slice + j * params::degree / 2;
|
||||
@@ -292,7 +291,7 @@ uint64_t get_buffer_size_programmable_bootstrap(
|
||||
}
|
||||
// Otherwise, both kernels run all in shared memory
|
||||
uint64_t buffer_size = device_mem +
|
||||
// global_accumulator_fft
|
||||
// global_join_buffer
|
||||
(glwe_dimension + 1) * level_count *
|
||||
input_lwe_ciphertext_count *
|
||||
(polynomial_size / 2) * sizeof(double2) +
|
||||
@@ -368,7 +367,7 @@ __host__ void execute_step_one(
|
||||
cudaStream_t stream, uint32_t gpu_index, Torus const *lut_vector,
|
||||
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
|
||||
Torus const *lwe_input_indexes, double2 const *bootstrapping_key,
|
||||
Torus *global_accumulator, double2 *global_accumulator_fft,
|
||||
Torus *global_accumulator, double2 *global_join_buffer,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, int8_t *d_mem, int lwe_iteration, uint64_t partial_sm,
|
||||
@@ -383,21 +382,21 @@ __host__ void execute_step_one(
|
||||
device_programmable_bootstrap_step_one<Torus, params, NOSM>
|
||||
<<<grid, thds, 0, stream>>>(
|
||||
lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
|
||||
bootstrapping_key, global_accumulator, global_accumulator_fft,
|
||||
bootstrapping_key, global_accumulator, global_join_buffer,
|
||||
lwe_iteration, lwe_dimension, polynomial_size, base_log,
|
||||
level_count, d_mem, full_dm);
|
||||
} else if (max_shared_memory < full_sm) {
|
||||
device_programmable_bootstrap_step_one<Torus, params, PARTIALSM>
|
||||
<<<grid, thds, partial_sm, stream>>>(
|
||||
lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
|
||||
bootstrapping_key, global_accumulator, global_accumulator_fft,
|
||||
bootstrapping_key, global_accumulator, global_join_buffer,
|
||||
lwe_iteration, lwe_dimension, polynomial_size, base_log,
|
||||
level_count, d_mem, partial_dm);
|
||||
} else {
|
||||
device_programmable_bootstrap_step_one<Torus, params, FULLSM>
|
||||
<<<grid, thds, full_sm, stream>>>(
|
||||
lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
|
||||
bootstrapping_key, global_accumulator, global_accumulator_fft,
|
||||
bootstrapping_key, global_accumulator, global_join_buffer,
|
||||
lwe_iteration, lwe_dimension, polynomial_size, base_log,
|
||||
level_count, d_mem, 0);
|
||||
}
|
||||
@@ -409,7 +408,7 @@ __host__ void execute_step_two(
|
||||
cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
|
||||
Torus const *lwe_output_indexes, Torus const *lut_vector,
|
||||
Torus const *lut_vector_indexes, double2 const *bootstrapping_key,
|
||||
Torus *global_accumulator, double2 *global_accumulator_fft,
|
||||
Torus *global_accumulator, double2 *global_join_buffer,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, int8_t *d_mem, int lwe_iteration, uint64_t partial_sm,
|
||||
@@ -425,21 +424,21 @@ __host__ void execute_step_two(
|
||||
device_programmable_bootstrap_step_two<Torus, params, NOSM>
|
||||
<<<grid, thds, 0, stream>>>(
|
||||
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
|
||||
bootstrapping_key, global_accumulator, global_accumulator_fft,
|
||||
bootstrapping_key, global_accumulator, global_join_buffer,
|
||||
lwe_iteration, lwe_dimension, polynomial_size, base_log,
|
||||
level_count, d_mem, full_dm, lut_count, lut_stride);
|
||||
} else if (max_shared_memory < full_sm) {
|
||||
device_programmable_bootstrap_step_two<Torus, params, PARTIALSM>
|
||||
<<<grid, thds, partial_sm, stream>>>(
|
||||
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
|
||||
bootstrapping_key, global_accumulator, global_accumulator_fft,
|
||||
bootstrapping_key, global_accumulator, global_join_buffer,
|
||||
lwe_iteration, lwe_dimension, polynomial_size, base_log,
|
||||
level_count, d_mem, partial_dm, lut_count, lut_stride);
|
||||
} else {
|
||||
device_programmable_bootstrap_step_two<Torus, params, FULLSM>
|
||||
<<<grid, thds, full_sm, stream>>>(
|
||||
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
|
||||
bootstrapping_key, global_accumulator, global_accumulator_fft,
|
||||
bootstrapping_key, global_accumulator, global_join_buffer,
|
||||
lwe_iteration, lwe_dimension, polynomial_size, base_log,
|
||||
level_count, d_mem, 0, lut_count, lut_stride);
|
||||
}
|
||||
@@ -478,20 +477,20 @@ __host__ void host_programmable_bootstrap(
|
||||
uint64_t full_dm_step_two = full_sm_step_two;
|
||||
|
||||
Torus *global_accumulator = pbs_buffer->global_accumulator;
|
||||
double2 *global_accumulator_fft = pbs_buffer->global_accumulator_fft;
|
||||
double2 *global_join_buffer = pbs_buffer->global_join_buffer;
|
||||
int8_t *d_mem = pbs_buffer->d_mem;
|
||||
|
||||
for (int i = 0; i < lwe_dimension; i++) {
|
||||
execute_step_one<Torus, params>(
|
||||
stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, global_accumulator,
|
||||
global_accumulator_fft, input_lwe_ciphertext_count, lwe_dimension,
|
||||
global_join_buffer, input_lwe_ciphertext_count, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, base_log, level_count, d_mem, i,
|
||||
partial_sm, partial_dm_step_one, full_sm_step_one, full_dm_step_one);
|
||||
execute_step_two<Torus, params>(
|
||||
stream, gpu_index, lwe_array_out, lwe_output_indexes, lut_vector,
|
||||
lut_vector_indexes, bootstrapping_key, global_accumulator,
|
||||
global_accumulator_fft, input_lwe_ciphertext_count, lwe_dimension,
|
||||
global_join_buffer, input_lwe_ciphertext_count, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, base_log, level_count, d_mem, i,
|
||||
partial_sm, partial_dm_step_two, full_sm_step_two, full_dm_step_two,
|
||||
lut_count, lut_stride);
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user