mirror of
https://github.com/zama-ai/tfhe-rs.git
synced 2026-01-11 15:48:20 -05:00
Compare commits
300 Commits
ns/test/pr
...
al/remove_
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
47e847dab4 | ||
|
|
c6ff00c500 | ||
|
|
54a08afb46 | ||
|
|
6c8591dc21 | ||
|
|
876cde1f6a | ||
|
|
ee938797c3 | ||
|
|
2311087a64 | ||
|
|
7dfabdd4b5 | ||
|
|
212af17538 | ||
|
|
c7f4de9a21 | ||
|
|
2b25b20aeb | ||
|
|
4a930264f5 | ||
|
|
2498087610 | ||
|
|
375481c66e | ||
|
|
cb9dac6eed | ||
|
|
04c6f18d42 | ||
|
|
75d2457a6f | ||
|
|
dedb3e94e5 | ||
|
|
766809afe4 | ||
|
|
22728b9156 | ||
|
|
59380fcacb | ||
|
|
b50029fcff | ||
|
|
7f9ba6ed28 | ||
|
|
9f6e7cd3fc | ||
|
|
b14db1e3fd | ||
|
|
3b4cb6b1fc | ||
|
|
81c16e7915 | ||
|
|
0fc24127a2 | ||
|
|
e9d3e21b93 | ||
|
|
53c4850d11 | ||
|
|
03154d5db8 | ||
|
|
576bc5782e | ||
|
|
8256e76f74 | ||
|
|
835cc6d9b0 | ||
|
|
c9be958d1a | ||
|
|
5183c1fb3e | ||
|
|
0d49d19a13 | ||
|
|
e91d532a36 | ||
|
|
1c2a0e82f9 | ||
|
|
e76503984a | ||
|
|
5cfc57f51a | ||
|
|
840498977c | ||
|
|
77a34a952e | ||
|
|
d9e9a5bb3f | ||
|
|
03431e41a9 | ||
|
|
5d522ffeaa | ||
|
|
3956f96318 | ||
|
|
7192ecb695 | ||
|
|
40b097d819 | ||
|
|
45effa41d5 | ||
|
|
d2efa82daf | ||
|
|
bd66a6fd2b | ||
|
|
16feb46afc | ||
|
|
81d82bc45c | ||
|
|
7afe9b71d2 | ||
|
|
41fae73e63 | ||
|
|
de7c7f209f | ||
|
|
84de0a7b23 | ||
|
|
4bb115e1e7 | ||
|
|
b365585c74 | ||
|
|
ea3ec8cbdd | ||
|
|
8c51e22aa5 | ||
|
|
283a3c911b | ||
|
|
2bf483c596 | ||
|
|
2e0736afc6 | ||
|
|
400ce27beb | ||
|
|
43d91f512f | ||
|
|
5db5aba24a | ||
|
|
361c9618a0 | ||
|
|
35dac0d85c | ||
|
|
1c0b6fbbd4 | ||
|
|
8c6e916076 | ||
|
|
49ab72bcec | ||
|
|
937b72c538 | ||
|
|
0259886375 | ||
|
|
97822db5fc | ||
|
|
934b5f40a1 | ||
|
|
3ff81c3c4b | ||
|
|
bce5cd3552 | ||
|
|
ec83165acc | ||
|
|
d63c2f7705 | ||
|
|
5bcc34728a | ||
|
|
b62228b429 | ||
|
|
b63347336b | ||
|
|
a631904bd1 | ||
|
|
da850865ec | ||
|
|
8be769e282 | ||
|
|
47ea8bf45c | ||
|
|
4823b8a1a0 | ||
|
|
01f3a6d133 | ||
|
|
bf613f36b3 | ||
|
|
faf200218b | ||
|
|
24088fd494 | ||
|
|
48315dca80 | ||
|
|
52b148a728 | ||
|
|
d0624d6184 | ||
|
|
00fc2818a9 | ||
|
|
b93c23e5f8 | ||
|
|
1c59c1c260 | ||
|
|
ca7b29163e | ||
|
|
f7a18ddb23 | ||
|
|
7b9085d0e2 | ||
|
|
d52fa249a5 | ||
|
|
35e7031751 | ||
|
|
d9662daea5 | ||
|
|
32cdb0b5a0 | ||
|
|
a6aa95ce2d | ||
|
|
97d7ed9ec2 | ||
|
|
07045f1137 | ||
|
|
3ab7f49436 | ||
|
|
040e28d822 | ||
|
|
a113674c82 | ||
|
|
1d06691dda | ||
|
|
fc21804f3e | ||
|
|
c0878f1600 | ||
|
|
97f1277e06 | ||
|
|
e1dd4ba4bf | ||
|
|
d96a368b37 | ||
|
|
47c8d4cf64 | ||
|
|
9633b61298 | ||
|
|
8299e1cb9a | ||
|
|
72ad76b5e7 | ||
|
|
0e6423820f | ||
|
|
c45ee6a236 | ||
|
|
cf7b21f1af | ||
|
|
f9026f1563 | ||
|
|
95ab73cbaa | ||
|
|
35faaef431 | ||
|
|
a2ae1a4440 | ||
|
|
077d5727da | ||
|
|
8314e7d47c | ||
|
|
9dca245946 | ||
|
|
345f25c5c3 | ||
|
|
c6756748f7 | ||
|
|
bd21971c84 | ||
|
|
e96ad74006 | ||
|
|
abd87a0f0c | ||
|
|
3875c97574 | ||
|
|
6fabe6bab0 | ||
|
|
91171c738d | ||
|
|
7bf0dc157d | ||
|
|
0612ef5be5 | ||
|
|
aee4c1ed18 | ||
|
|
e2a3ef151a | ||
|
|
6f77bea5e0 | ||
|
|
e4f72dab30 | ||
|
|
7ed3fded4a | ||
|
|
488c942a3a | ||
|
|
c0d98394fa | ||
|
|
93ff6992e2 | ||
|
|
2a4026c761 | ||
|
|
39c424b14d | ||
|
|
46a7a3b43b | ||
|
|
38b5759e88 | ||
|
|
d6f8e59394 | ||
|
|
a95db07003 | ||
|
|
6544e6f6a3 | ||
|
|
1d549dfd8a | ||
|
|
019548daa5 | ||
|
|
26b666955a | ||
|
|
ce9da12e65 | ||
|
|
32b45ac4bc | ||
|
|
26055b236e | ||
|
|
ce9e355c15 | ||
|
|
85cc638c62 | ||
|
|
d454b5386b | ||
|
|
426f3bd192 | ||
|
|
4c707e79d8 | ||
|
|
e1afb8126d | ||
|
|
0d1ef0af7e | ||
|
|
15e3474cda | ||
|
|
10be6f9423 | ||
|
|
c521c2ca2e | ||
|
|
39c46056f6 | ||
|
|
aa2b27460c | ||
|
|
c258d53625 | ||
|
|
8ddee20a57 | ||
|
|
1d786b7202 | ||
|
|
7267d60e01 | ||
|
|
0148a6ffc8 | ||
|
|
63571a07ae | ||
|
|
6e2908ad4e | ||
|
|
d3d06c905f | ||
|
|
051f33f166 | ||
|
|
11a8f97a1c | ||
|
|
35a9c323a7 | ||
|
|
641f47b775 | ||
|
|
456d0ced1b | ||
|
|
358bcc9a22 | ||
|
|
27a4564d83 | ||
|
|
296e419f6c | ||
|
|
e1a25a10ac | ||
|
|
d9349b3357 | ||
|
|
68e4ac4896 | ||
|
|
3f318a2046 | ||
|
|
d1380794ed | ||
|
|
fe5641ef6d | ||
|
|
3397aa81d2 | ||
|
|
8f10f8f8db | ||
|
|
92be95c6b8 | ||
|
|
990c4d0380 | ||
|
|
1d5abfd5ea | ||
|
|
dfd1beeb47 | ||
|
|
43a007a2fa | ||
|
|
54faf64ecd | ||
|
|
8fe7f9c3cb | ||
|
|
9ed65db03d | ||
|
|
9413d3e722 | ||
|
|
2000feb87e | ||
|
|
594a5cee25 | ||
|
|
401cfc5fd0 | ||
|
|
769c725c67 | ||
|
|
07d143e032 | ||
|
|
d88bba761b | ||
|
|
eaa1d07f90 | ||
|
|
663322cfa5 | ||
|
|
ddd6a6e136 | ||
|
|
abc39f0a3e | ||
|
|
8b7556667b | ||
|
|
67b1607773 | ||
|
|
5340859003 | ||
|
|
a26e68c3bc | ||
|
|
0dd622ebb9 | ||
|
|
d69dd20079 | ||
|
|
80fe45f354 | ||
|
|
33114e3946 | ||
|
|
ede0745b7f | ||
|
|
bc4cd08e7a | ||
|
|
b03921f1ae | ||
|
|
70f7af06f5 | ||
|
|
a9bb6eac5f | ||
|
|
4fa9b243e0 | ||
|
|
b88f561358 | ||
|
|
0e71ca6c1c | ||
|
|
3ba61c0694 | ||
|
|
781f78c442 | ||
|
|
ebfc1ea8ac | ||
|
|
7fa9f33776 | ||
|
|
5547d92c79 | ||
|
|
351fc476b5 | ||
|
|
53cd3c8d0f | ||
|
|
0a2ad8ca72 | ||
|
|
eba4f6a89c | ||
|
|
4b933cf421 | ||
|
|
3303cd8568 | ||
|
|
f937524f64 | ||
|
|
e7da96271c | ||
|
|
0cc716544b | ||
|
|
f53087b5ed | ||
|
|
bcefe977c9 | ||
|
|
73ea24fd51 | ||
|
|
6f1a9bdaa5 | ||
|
|
7834f699d0 | ||
|
|
b81692b2df | ||
|
|
8748d1cc22 | ||
|
|
dbb13aa35e | ||
|
|
53f4c9bfc7 | ||
|
|
4021812248 | ||
|
|
190c5e7bb7 | ||
|
|
2004333d6e | ||
|
|
e7c06ef956 | ||
|
|
7b14fe6fee | ||
|
|
55f4df97b4 | ||
|
|
2144ec8107 | ||
|
|
fb862ddbbc | ||
|
|
ab0b01f7e1 | ||
|
|
6c4318b8bb | ||
|
|
d3f2ecd367 | ||
|
|
19dc0f02f9 | ||
|
|
95d50368fa | ||
|
|
c117798b10 | ||
|
|
da0934d4bc | ||
|
|
b522de3273 | ||
|
|
9205703454 | ||
|
|
a1b92a6db8 | ||
|
|
8d7c45bf17 | ||
|
|
91f05b00b9 | ||
|
|
ebb11b15c4 | ||
|
|
18270714d8 | ||
|
|
6c6525b1ea | ||
|
|
79f8971712 | ||
|
|
17db09bf2a | ||
|
|
fc9bfcaf61 | ||
|
|
d93c412dc5 | ||
|
|
ea222007d8 | ||
|
|
3470d6c2d8 | ||
|
|
fffdc3862e | ||
|
|
d9eca01631 | ||
|
|
95ef13f6ce | ||
|
|
230fa5a8f0 | ||
|
|
b443855b8b | ||
|
|
ba80c33328 | ||
|
|
e5dc45c084 | ||
|
|
b450f0eb30 | ||
|
|
7479cc826b | ||
|
|
b2beac2d2c | ||
|
|
b700416597 | ||
|
|
42609987a1 | ||
|
|
5b37a838ba | ||
|
|
c1fcd95d72 |
121
.github/workflows/aws_tfhe_backward_compat_tests.yml
vendored
Normal file
121
.github/workflows/aws_tfhe_backward_compat_tests.yml
vendored
Normal file
@@ -0,0 +1,121 @@
|
||||
# Run backward compatibility tests
|
||||
name: Backward compatibility Tests on CPU
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUSTFLAGS: "-C target-cpu=native"
|
||||
RUST_BACKTRACE: "full"
|
||||
RUST_MIN_STACK: "8388608"
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
|
||||
on:
|
||||
# Allows you to run this workflow manually from the Actions tab as an alternative.
|
||||
workflow_dispatch:
|
||||
pull_request:
|
||||
|
||||
jobs:
|
||||
setup-instance:
|
||||
name: Setup instance (backward-compat-tests)
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
runner-name: ${{ steps.start-instance.outputs.label }}
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
backend: aws
|
||||
profile: cpu-small
|
||||
|
||||
backward-compat-tests:
|
||||
name: Backward compatibility tests
|
||||
needs: [ setup-instance ]
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}_${{ github.ref }}
|
||||
cancel-in-progress: true
|
||||
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
|
||||
with:
|
||||
persist-credentials: 'false'
|
||||
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Set up home
|
||||
run: |
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
- name: Install git-lfs
|
||||
run: |
|
||||
sudo apt update && sudo apt -y install git-lfs
|
||||
|
||||
- name: Use specific data branch
|
||||
if: ${{ contains(github.event.pull_request.labels.*.name, 'data_PR') }}
|
||||
env:
|
||||
PR_BRANCH: ${{ github.head_ref || github.ref_name }}
|
||||
run: |
|
||||
echo "BACKWARD_COMPAT_DATA_BRANCH=${PR_BRANCH}" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Get backward compat branch
|
||||
id: backward_compat_branch
|
||||
run: |
|
||||
BRANCH="$(make backward_compat_branch)"
|
||||
echo "branch=${BRANCH}" >> "${GITHUB_OUTPUT}"
|
||||
|
||||
- name: Clone test data
|
||||
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
|
||||
with:
|
||||
persist-credentials: 'false'
|
||||
repository: zama-ai/tfhe-backward-compat-data
|
||||
path: tfhe/tfhe-backward-compat-data
|
||||
lfs: 'true'
|
||||
ref: ${{ steps.backward_compat_branch.outputs.branch }}
|
||||
|
||||
- name: Run backward compatibility tests
|
||||
run: |
|
||||
make test_backward_compatibility_ci
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "Backward compatibility tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
|
||||
teardown-instance:
|
||||
name: Teardown instance (backward-compat-tests)
|
||||
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
|
||||
needs: [ setup-instance, backward-compat-tests ]
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
label: ${{ needs.setup-instance.outputs.runner-name }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "Instance teardown (backward-compat-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
42
.github/workflows/aws_tfhe_fast_tests.yml
vendored
42
.github/workflows/aws_tfhe_fast_tests.yml
vendored
@@ -26,6 +26,7 @@ jobs:
|
||||
outputs:
|
||||
csprng_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.csprng_any_changed }}
|
||||
zk_pok_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.zk_pok_any_changed }}
|
||||
versionable_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.versionable_any_changed }}
|
||||
core_crypto_test: ${{ env.IS_PULL_REQUEST == 'false' ||
|
||||
steps.changed-files.outputs.core_crypto_any_changed ||
|
||||
steps.changed-files.outputs.dependencies_any_changed }}
|
||||
@@ -50,13 +51,13 @@ jobs:
|
||||
any_file_changed: ${{ env.IS_PULL_REQUEST == 'false' || steps.aggregated-changes.outputs.any_changed }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Check for file changes
|
||||
id: changed-files
|
||||
uses: tj-actions/changed-files@cc733854b1f224978ef800d29e4709d5ee2883e4
|
||||
uses: tj-actions/changed-files@48d8f15b2aaa3d255ca5af3eba4870f807ce6b3c
|
||||
with:
|
||||
since_last_remote_commit: true
|
||||
files_yaml: |
|
||||
@@ -64,10 +65,15 @@ jobs:
|
||||
- tfhe/Cargo.toml
|
||||
- concrete-csprng/**
|
||||
- tfhe-zk-pok/**
|
||||
- utils/tfhe-versionable/**
|
||||
- utils/tfhe-versionable-derive/**
|
||||
csprng:
|
||||
- concrete-csprng/**
|
||||
zk_pok:
|
||||
- tfhe-zk-pok/**
|
||||
versionable:
|
||||
- utils/tfhe-versionable/**
|
||||
- utils/tfhe-versionable-derive/**
|
||||
core_crypto:
|
||||
- tfhe/src/core_crypto/**
|
||||
boolean:
|
||||
@@ -103,6 +109,7 @@ jobs:
|
||||
if: ( steps.changed-files.outputs.dependencies_any_changed == 'true' ||
|
||||
steps.changed-files.outputs.csprng_any_changed == 'true' ||
|
||||
steps.changed-files.outputs.zk_pok_any_changed == 'true' ||
|
||||
steps.changed-files.outputs.versionable_any_changed == 'true' ||
|
||||
steps.changed-files.outputs.core_crypto_any_changed == 'true' ||
|
||||
steps.changed-files.outputs.boolean_any_changed == 'true' ||
|
||||
steps.changed-files.outputs.shortint_any_changed == 'true' ||
|
||||
@@ -124,7 +131,7 @@ jobs:
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -144,23 +151,20 @@ jobs:
|
||||
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
|
||||
with:
|
||||
persist-credentials: 'false'
|
||||
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Set up home
|
||||
run: |
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
- name: Install git-lfs
|
||||
run: |
|
||||
sudo apt update && sudo apt -y install git-lfs
|
||||
|
||||
- name: Run concrete-csprng tests
|
||||
if: needs.should-run.outputs.csprng_test == 'true'
|
||||
run: |
|
||||
@@ -171,6 +175,11 @@ jobs:
|
||||
run: |
|
||||
make test_zk_pok
|
||||
|
||||
- name: Run tfhe-versionable tests
|
||||
if: needs.should-run.outputs.versionable_test == 'true'
|
||||
run: |
|
||||
make test_versionable
|
||||
|
||||
- name: Run core tests
|
||||
if: needs.should-run.outputs.core_crypto_test == 'true'
|
||||
run: |
|
||||
@@ -216,19 +225,8 @@ jobs:
|
||||
run: |
|
||||
make test_safe_deserialization
|
||||
|
||||
- name: Clone test data
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
with:
|
||||
repository: zama-ai/tfhe-backward-compat-data
|
||||
path: tfhe/tfhe-backward-compat-data
|
||||
lfs: 'true'
|
||||
|
||||
- name: Run backward compatibility tests
|
||||
run: |
|
||||
make test_backward_compatibility_ci
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ always() }}
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
|
||||
env:
|
||||
@@ -243,7 +241,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
62
.github/workflows/aws_tfhe_integer_tests.yml
vendored
62
.github/workflows/aws_tfhe_integer_tests.yml
vendored
@@ -19,27 +19,60 @@ on:
|
||||
# Allows you to run this workflow manually from the Actions tab as an alternative.
|
||||
workflow_dispatch:
|
||||
pull_request:
|
||||
types: [ labeled ]
|
||||
types: [labeled]
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
schedule:
|
||||
# Nightly tests @ 3AM after each work day
|
||||
- cron: "0 3 * * MON-FRI"
|
||||
|
||||
jobs:
|
||||
should-run:
|
||||
if:
|
||||
(github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs') ||
|
||||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
|
||||
(github.event_name == 'pull_request' && contains(github.event.label.name, 'approved')) ||
|
||||
github.event_name == 'workflow_dispatch'
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
pull-requests: write
|
||||
outputs:
|
||||
integer_test: ${{ github.event_name == 'workflow_dispatch' ||
|
||||
steps.changed-files.outputs.integer_any_changed }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: "false"
|
||||
|
||||
- name: Check for file changes
|
||||
id: changed-files
|
||||
uses: tj-actions/changed-files@48d8f15b2aaa3d255ca5af3eba4870f807ce6b3c
|
||||
with:
|
||||
since_last_remote_commit: true
|
||||
files_yaml: |
|
||||
integer:
|
||||
- tfhe/Cargo.toml
|
||||
- concrete-csprng/**
|
||||
- tfhe-zk-pok/**
|
||||
- tfhe/src/core_crypto/**
|
||||
- tfhe/src/shortint/**
|
||||
- tfhe/src/integer/**
|
||||
|
||||
setup-instance:
|
||||
name: Setup instance (unsigned-integer-tests)
|
||||
if: (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs') ||
|
||||
(github.event_name == 'pull_request' && contains(github.event.label.name, 'approved')) ||
|
||||
(github.event_name != 'push' && github.event_name != 'pull_request')
|
||||
needs: should-run
|
||||
if:
|
||||
(github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs' && needs.should-run.outputs.integer_test == 'true') ||
|
||||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
|
||||
(github.event_name == 'pull_request' && contains(github.event.label.name, 'approved')) ||
|
||||
github.event_name == 'workflow_dispatch'
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
runner-name: ${{ steps.start-instance.outputs.label }}
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -57,16 +90,17 @@ jobs:
|
||||
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
|
||||
with:
|
||||
persist-credentials: 'false'
|
||||
persist-credentials: "false"
|
||||
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Set up home
|
||||
run: |
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
@@ -92,7 +126,7 @@ jobs:
|
||||
AVX512_SUPPORT=ON NO_BIG_PARAMS=${{ env.NO_BIG_PARAMS }} BIG_TESTS_INSTANCE=TRUE make test_unsigned_integer_ci
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ always() }}
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
|
||||
env:
|
||||
@@ -102,12 +136,12 @@ jobs:
|
||||
teardown-instance:
|
||||
name: Teardown instance (unsigned-integer-tests)
|
||||
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
|
||||
needs: [ setup-instance, unsigned-integer-tests ]
|
||||
needs: [setup-instance, unsigned-integer-tests]
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
@@ -19,27 +19,60 @@ on:
|
||||
# Allows you to run this workflow manually from the Actions tab as an alternative.
|
||||
workflow_dispatch:
|
||||
pull_request:
|
||||
types: [ labeled ]
|
||||
types: [labeled]
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
schedule:
|
||||
# Nightly tests @ 3AM after each work day
|
||||
- cron: "0 3 * * MON-FRI"
|
||||
|
||||
jobs:
|
||||
should-run:
|
||||
if:
|
||||
(github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs') ||
|
||||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
|
||||
(github.event_name == 'pull_request' && contains(github.event.label.name, 'approved')) ||
|
||||
github.event_name == 'workflow_dispatch'
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
pull-requests: write
|
||||
outputs:
|
||||
integer_test: ${{ github.event_name == 'workflow_dispatch' ||
|
||||
steps.changed-files.outputs.integer_any_changed }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: "false"
|
||||
|
||||
- name: Check for file changes
|
||||
id: changed-files
|
||||
uses: tj-actions/changed-files@48d8f15b2aaa3d255ca5af3eba4870f807ce6b3c
|
||||
with:
|
||||
since_last_remote_commit: true
|
||||
files_yaml: |
|
||||
integer:
|
||||
- tfhe/Cargo.toml
|
||||
- concrete-csprng/**
|
||||
- tfhe-zk-pok/**
|
||||
- tfhe/src/core_crypto/**
|
||||
- tfhe/src/shortint/**
|
||||
- tfhe/src/integer/**
|
||||
|
||||
setup-instance:
|
||||
name: Setup instance (signed-integer-tests)
|
||||
if: (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs') ||
|
||||
(github.event_name == 'pull_request' && contains(github.event.label.name, 'approved')) ||
|
||||
(github.event_name != 'push' && github.event_name != 'pull_request')
|
||||
name: Setup instance (unsigned-integer-tests)
|
||||
needs: should-run
|
||||
if:
|
||||
(github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs' && needs.should-run.outputs.integer_test == 'true') ||
|
||||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
|
||||
(github.event_name == 'pull_request' && contains(github.event.label.name, 'approved')) ||
|
||||
github.event_name == 'workflow_dispatch'
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
runner-name: ${{ steps.start-instance.outputs.label }}
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -57,16 +90,17 @@ jobs:
|
||||
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
|
||||
with:
|
||||
persist-credentials: 'false'
|
||||
persist-credentials: "false"
|
||||
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Set up home
|
||||
run: |
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
@@ -96,7 +130,7 @@ jobs:
|
||||
AVX512_SUPPORT=ON NO_BIG_PARAMS=${{ env.NO_BIG_PARAMS }} BIG_TESTS_INSTANCE=TRUE make test_signed_integer_ci
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ always() }}
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
|
||||
env:
|
||||
@@ -106,12 +140,12 @@ jobs:
|
||||
teardown-instance:
|
||||
name: Teardown instance (signed-integer-tests)
|
||||
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
|
||||
needs: [ setup-instance, signed-integer-tests ]
|
||||
needs: [setup-instance, signed-integer-tests]
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
15
.github/workflows/aws_tfhe_tests.yml
vendored
15
.github/workflows/aws_tfhe_tests.yml
vendored
@@ -57,13 +57,13 @@ jobs:
|
||||
any_file_changed: ${{ env.IS_PULL_REQUEST == 'false' || steps.aggregated-changes.outputs.any_changed }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Check for file changes
|
||||
id: changed-files
|
||||
uses: tj-actions/changed-files@cc733854b1f224978ef800d29e4709d5ee2883e4
|
||||
uses: tj-actions/changed-files@48d8f15b2aaa3d255ca5af3eba4870f807ce6b3c
|
||||
with:
|
||||
since_last_remote_commit: true
|
||||
files_yaml: |
|
||||
@@ -131,7 +131,7 @@ jobs:
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -151,16 +151,17 @@ jobs:
|
||||
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
|
||||
with:
|
||||
persist-credentials: 'false'
|
||||
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Set up home
|
||||
run: |
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
@@ -222,7 +223,7 @@ jobs:
|
||||
make test_kreyvium
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ always() }}
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
|
||||
env:
|
||||
@@ -237,7 +238,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
17
.github/workflows/aws_tfhe_wasm_tests.yml
vendored
17
.github/workflows/aws_tfhe_wasm_tests.yml
vendored
@@ -27,7 +27,7 @@ jobs:
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -45,22 +45,25 @@ jobs:
|
||||
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
|
||||
with:
|
||||
persist-credentials: 'false'
|
||||
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Set up home
|
||||
run: |
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
- name: Install Node
|
||||
- name: Install web resources
|
||||
run: |
|
||||
make install_node
|
||||
make install_chrome_browser
|
||||
make install_chrome_web_driver
|
||||
|
||||
- name: Run fmt checks
|
||||
run: |
|
||||
@@ -72,10 +75,10 @@ jobs:
|
||||
|
||||
- name: Run parallel wasm tests
|
||||
run: |
|
||||
make test_web_js_api_parallel_ci
|
||||
make test_web_js_api_parallel_chrome_ci
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ always() }}
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
|
||||
env:
|
||||
@@ -90,7 +93,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
@@ -29,7 +29,7 @@ jobs:
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -48,9 +48,10 @@ jobs:
|
||||
continue-on-error: true
|
||||
steps:
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
|
||||
with:
|
||||
fetch-depth: 0
|
||||
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Get benchmark details
|
||||
run: |
|
||||
@@ -66,7 +67,7 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
@@ -98,13 +99,13 @@ jobs:
|
||||
--append-results
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@0b2256b8c012f0828dc542b3febcab082c67f72b
|
||||
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
|
||||
with:
|
||||
name: ${{ github.sha }}_boolean
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
@@ -113,16 +114,8 @@ jobs:
|
||||
- name: Send data to Slab
|
||||
shell: bash
|
||||
run: |
|
||||
echo "Computing HMac on results file"
|
||||
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
|
||||
echo "Sending results to Slab..."
|
||||
curl -v -k \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "X-Slab-Repository: ${{ github.repository }}" \
|
||||
-H "X-Slab-Command: store_data_v2" \
|
||||
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
|
||||
-d @${{ env.RESULTS_FILENAME }} \
|
||||
${{ secrets.SLAB_URL }}
|
||||
python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
|
||||
--slab-url "${{ secrets.SLAB_URL }}"
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() }}
|
||||
@@ -140,7 +133,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -26,7 +26,7 @@ jobs:
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -44,9 +44,10 @@ jobs:
|
||||
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
|
||||
with:
|
||||
fetch-depth: 0
|
||||
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Get benchmark details
|
||||
run: |
|
||||
@@ -62,7 +63,7 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
@@ -86,13 +87,13 @@ jobs:
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@0b2256b8c012f0828dc542b3febcab082c67f72b
|
||||
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
|
||||
with:
|
||||
name: ${{ github.sha }}_core_crypto
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
@@ -101,16 +102,8 @@ jobs:
|
||||
- name: Send data to Slab
|
||||
shell: bash
|
||||
run: |
|
||||
echo "Computing HMac on downloaded artifact"
|
||||
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
|
||||
echo "Sending results to Slab..."
|
||||
curl -v -k \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "X-Slab-Repository: ${{ github.repository }}" \
|
||||
-H "X-Slab-Command: store_data_v2" \
|
||||
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
|
||||
-d @${{ env.RESULTS_FILENAME }} \
|
||||
${{ secrets.SLAB_URL }}
|
||||
python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
|
||||
--slab-url "${{ secrets.SLAB_URL }}"
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() }}
|
||||
@@ -128,7 +121,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -1,5 +1,5 @@
|
||||
# Run all benchmarks on an RTX 4090 machine and return parsed results to Slab CI bot.
|
||||
name: TFHE Cuda Backend - 4090 full benchmarks
|
||||
# Run benchmarks on an RTX 4090 machine and return parsed results to Slab CI bot.
|
||||
name: TFHE Cuda Backend - 4090 benchmarks
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
@@ -11,6 +11,7 @@ env:
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
FAST_BENCH: TRUE
|
||||
|
||||
on:
|
||||
# Allows you to run this workflow manually from the Actions tab as an alternative.
|
||||
@@ -23,8 +24,10 @@ on:
|
||||
|
||||
jobs:
|
||||
cuda-integer-benchmarks:
|
||||
name: Cuda integer benchmarks for all operations flavor (RTX 4090)
|
||||
if: ${{ github.event_name == 'workflow_dispatch' || github.event_name == 'schedule' || contains(github.event.label.name, '4090_bench') }}
|
||||
name: Cuda integer benchmarks (RTX 4090)
|
||||
if: ${{ github.event_name == 'workflow_dispatch' ||
|
||||
github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs' ||
|
||||
contains(github.event.label.name, '4090_bench') }}
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}_${{ github.ref }}_cuda_integer_bench
|
||||
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
|
||||
@@ -33,15 +36,13 @@ jobs:
|
||||
strategy:
|
||||
fail-fast: false
|
||||
max-parallel: 1
|
||||
matrix:
|
||||
command: [integer, integer_multi_bit]
|
||||
op_flavor: [default, unchecked]
|
||||
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
|
||||
with:
|
||||
fetch-depth: 0
|
||||
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Get benchmark details
|
||||
run: |
|
||||
@@ -50,14 +51,15 @@ jobs:
|
||||
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
|
||||
echo "COMMIT_HASH=$(git describe --tags --dirty)";
|
||||
} >> "${GITHUB_ENV}"
|
||||
echo "FAST_BENCH=TRUE" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
@@ -65,7 +67,7 @@ jobs:
|
||||
|
||||
- name: Run integer benchmarks
|
||||
run: |
|
||||
make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_${{ matrix.command }}_gpu
|
||||
make BENCH_OP_FLAVOR=default bench_integer_multi_bit_gpu
|
||||
|
||||
- name: Parse results
|
||||
run: |
|
||||
@@ -81,27 +83,19 @@ jobs:
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@0b2256b8c012f0828dc542b3febcab082c67f72b
|
||||
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
|
||||
with:
|
||||
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
|
||||
name: ${{ github.sha }}_integer_multi_bit_gpu_default
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
- name: Send data to Slab
|
||||
shell: bash
|
||||
run: |
|
||||
echo "Computing HMac on results file"
|
||||
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
|
||||
echo "Sending results to Slab..."
|
||||
curl -v -k \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "X-Slab-Repository: ${{ github.repository }}" \
|
||||
-H "X-Slab-Command: store_data_v2" \
|
||||
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
|
||||
-d @${{ env.RESULTS_FILENAME }} \
|
||||
${{ secrets.SLAB_URL }}
|
||||
python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
|
||||
--slab-url "${{ secrets.SLAB_URL }}"
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ always() }}
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
|
||||
env:
|
||||
@@ -120,7 +114,7 @@ jobs:
|
||||
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
@@ -133,18 +127,18 @@ jobs:
|
||||
} >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Run integer benchmarks
|
||||
- name: Run core crypto benchmarks
|
||||
run: |
|
||||
make bench_pbs_gpu
|
||||
make bench_ks_gpu
|
||||
@@ -163,7 +157,7 @@ jobs:
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@0b2256b8c012f0828dc542b3febcab082c67f72b
|
||||
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
|
||||
with:
|
||||
name: ${{ github.sha }}_core_crypto
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
@@ -183,7 +177,7 @@ jobs:
|
||||
${{ secrets.SLAB_URL }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ !success() && !cancelled() }}
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
|
||||
env:
|
||||
@@ -27,7 +27,7 @@ jobs:
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -65,9 +65,10 @@ jobs:
|
||||
sudo make install
|
||||
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
|
||||
with:
|
||||
fetch-depth: 0
|
||||
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Get benchmark details
|
||||
run: |
|
||||
@@ -83,7 +84,7 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
@@ -128,13 +129,13 @@ jobs:
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@0b2256b8c012f0828dc542b3febcab082c67f72b
|
||||
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
|
||||
with:
|
||||
name: ${{ github.sha }}_core_crypto
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
@@ -143,16 +144,8 @@ jobs:
|
||||
- name: Send data to Slab
|
||||
shell: bash
|
||||
run: |
|
||||
echo "Computing HMac on downloaded artifact"
|
||||
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
|
||||
echo "Sending results to Slab..."
|
||||
curl -v -k \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "X-Slab-Repository: ${{ github.repository }}" \
|
||||
-H "X-Slab-Command: store_data_v2" \
|
||||
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
|
||||
-d @${{ env.RESULTS_FILENAME }} \
|
||||
${{ secrets.SLAB_URL }}
|
||||
python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
|
||||
--slab-url "${{ secrets.SLAB_URL }}"
|
||||
|
||||
slack-notify:
|
||||
name: Slack Notification
|
||||
@@ -175,7 +168,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -23,14 +23,14 @@ jobs:
|
||||
setup-instance:
|
||||
name: Setup instance (cuda-integer-benchmarks)
|
||||
runs-on: ubuntu-latest
|
||||
if: github.event_name != 'push' ||
|
||||
if: github.event_name == 'workflow_dispatch' ||
|
||||
(github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs')
|
||||
outputs:
|
||||
runner-name: ${{ steps.start-instance.outputs.label }}
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -68,9 +68,10 @@ jobs:
|
||||
sudo make install
|
||||
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
|
||||
with:
|
||||
fetch-depth: 0
|
||||
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Get benchmark details
|
||||
run: |
|
||||
@@ -86,7 +87,7 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
@@ -110,6 +111,10 @@ jobs:
|
||||
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
|
||||
} >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Check device is detected
|
||||
if: ${{ !cancelled() }}
|
||||
run: nvidia-smi
|
||||
|
||||
- name: Run benchmarks with AVX512
|
||||
run: |
|
||||
make FAST_BENCH=TRUE BENCH_OP_FLAVOR=default bench_integer_gpu
|
||||
@@ -120,7 +125,7 @@ jobs:
|
||||
parse_integer_benches
|
||||
|
||||
- name: Upload csv results artifact
|
||||
uses: actions/upload-artifact@0b2256b8c012f0828dc542b3febcab082c67f72b
|
||||
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
|
||||
with:
|
||||
name: ${{ github.sha }}_csv_integer
|
||||
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
|
||||
@@ -140,13 +145,13 @@ jobs:
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@0b2256b8c012f0828dc542b3febcab082c67f72b
|
||||
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
|
||||
with:
|
||||
name: ${{ github.sha }}_integer
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
@@ -155,22 +160,14 @@ jobs:
|
||||
- name: Send data to Slab
|
||||
shell: bash
|
||||
run: |
|
||||
echo "Computing HMac on results file"
|
||||
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
|
||||
echo "Sending results to Slab..."
|
||||
curl -v -k \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "X-Slab-Repository: ${{ github.repository }}" \
|
||||
-H "X-Slab-Command: store_data_v2" \
|
||||
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
|
||||
-d @${{ env.RESULTS_FILENAME }} \
|
||||
${{ secrets.SLAB_URL }}
|
||||
python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
|
||||
--slab-url "${{ secrets.SLAB_URL }}"
|
||||
|
||||
slack-notify:
|
||||
name: Slack Notification
|
||||
needs: [ setup-instance, cuda-integer-benchmarks ]
|
||||
runs-on: ubuntu-latest
|
||||
if: ${{ !success() && !cancelled() }}
|
||||
if: ${{ always() && needs.cuda-integer-benchmarks.result != 'skipped' && failure() }}
|
||||
continue-on-error: true
|
||||
steps:
|
||||
- name: Send message
|
||||
@@ -187,7 +184,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
194
.github/workflows/benchmark_gpu_integer_2H100_full.yml
vendored
Normal file
194
.github/workflows/benchmark_gpu_integer_2H100_full.yml
vendored
Normal file
@@ -0,0 +1,194 @@
|
||||
# Run integer benchmarks on an instance with CUDA and return parsed results to Slab CI bot.
|
||||
name: Integer 2xH100 benchmarks
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
schedule:
|
||||
# Weekly benchmarks will be triggered each Saturday at 1 a.m. (UTC).
|
||||
- cron: '0 1 * * 6'
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUST_BACKTRACE: "full"
|
||||
RUST_MIN_STACK: "8388608"
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
|
||||
jobs:
|
||||
setup-instance:
|
||||
name: Setup instance (cuda-integer-full-2-gpu-benchmarks)
|
||||
runs-on: ubuntu-latest
|
||||
if: github.event_name != 'schedule' ||
|
||||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
|
||||
outputs:
|
||||
runner-name: ${{ steps.start-instance.outputs.label }}
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
backend: hyperstack
|
||||
profile: 2-h100
|
||||
|
||||
cuda-integer-full-2-gpu-benchmarks:
|
||||
name: Execute 2xH100 integer benchmarks
|
||||
needs: setup-instance
|
||||
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
|
||||
timeout-minutes: 1440 # 24 hours
|
||||
continue-on-error: true
|
||||
strategy:
|
||||
fail-fast: false
|
||||
max-parallel: 1
|
||||
matrix:
|
||||
command: [integer_multi_bit]
|
||||
op_flavor: [default]
|
||||
# explicit include-based build matrix, of known valid options
|
||||
include:
|
||||
- os: ubuntu-22.04
|
||||
cuda: "12.2"
|
||||
gcc: 11
|
||||
env:
|
||||
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
|
||||
CMAKE_VERSION: 3.29.6
|
||||
steps:
|
||||
# Mandatory on hyperstack since a bootable volume is not re-usable yet.
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
sudo apt update
|
||||
sudo apt install -y checkinstall zlib1g-dev libssl-dev
|
||||
wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
|
||||
tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
|
||||
cd cmake-${{ env.CMAKE_VERSION }}
|
||||
./bootstrap
|
||||
make -j"$(nproc)"
|
||||
sudo make install
|
||||
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
|
||||
with:
|
||||
fetch-depth: 0
|
||||
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Get benchmark details
|
||||
run: |
|
||||
{
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)";
|
||||
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
|
||||
echo "COMMIT_HASH=$(git describe --tags --dirty)";
|
||||
} >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Set up home
|
||||
# The "Install rust" step requires the root user to have a HOME directory, which is not set by default.
|
||||
run: |
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
- name: Export CUDA variables
|
||||
if: ${{ !cancelled() }}
|
||||
run: |
|
||||
{
|
||||
echo "CUDA_PATH=$CUDA_PATH";
|
||||
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
|
||||
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
|
||||
} >> "${GITHUB_ENV}"
|
||||
echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
|
||||
|
||||
# Specify the correct host compilers
|
||||
- name: Export gcc and g++ variables
|
||||
if: ${{ !cancelled() }}
|
||||
run: |
|
||||
{
|
||||
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
|
||||
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
|
||||
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
|
||||
} >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Check device is detected
|
||||
if: ${{ !cancelled() }}
|
||||
run: nvidia-smi
|
||||
|
||||
- name: Run benchmarks with AVX512
|
||||
run: |
|
||||
make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_${{ matrix.command }}_gpu
|
||||
|
||||
- name: Parse results
|
||||
run: |
|
||||
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
|
||||
--database tfhe_rs \
|
||||
--hardware "n3-H100x2" \
|
||||
--backend gpu \
|
||||
--project-version "${{ env.COMMIT_HASH }}" \
|
||||
--branch ${{ github.ref_name }} \
|
||||
--commit-date "${{ env.COMMIT_DATE }}" \
|
||||
--bench-date "${{ env.BENCH_DATE }}" \
|
||||
--walk-subdirs \
|
||||
--name-suffix avx512 \
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
|
||||
with:
|
||||
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
- name: Send data to Slab
|
||||
shell: bash
|
||||
run: |
|
||||
python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
|
||||
--slab-url "${{ secrets.SLAB_URL }}"
|
||||
|
||||
slack-notify:
|
||||
name: Slack Notification
|
||||
needs: [ setup-instance, cuda-integer-full-2-gpu-benchmarks ]
|
||||
runs-on: ubuntu-latest
|
||||
if: ${{ always() && needs.cuda-integer-full-2-gpu-benchmarks.result != 'skipped' && failure() }}
|
||||
continue-on-error: true
|
||||
steps:
|
||||
- name: Send message
|
||||
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
|
||||
env:
|
||||
SLACK_COLOR: ${{ needs.cuda-integer-full-2-gpu-benchmarks.result }}
|
||||
SLACK_MESSAGE: "Integer GPU 2xH100 benchmarks finished with status: ${{ needs.cuda-integer-full-2-gpu-benchmarks.result }}. (${{ env.ACTION_RUN_URL }})"
|
||||
|
||||
teardown-instance:
|
||||
name: Teardown instance (cuda-integer-full-2-gpu-benchmarks)
|
||||
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
|
||||
needs: [ setup-instance, cuda-integer-full-2-gpu-benchmarks ]
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
label: ${{ needs.setup-instance.outputs.runner-name }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "Instance teardown (cuda-integer-full-2-gpu-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
@@ -29,7 +29,7 @@ jobs:
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -49,7 +49,7 @@ jobs:
|
||||
max-parallel: 1
|
||||
matrix:
|
||||
command: [integer, integer_multi_bit]
|
||||
op_flavor: [default, unchecked]
|
||||
op_flavor: [default]
|
||||
# explicit include-based build matrix, of known valid options
|
||||
include:
|
||||
- os: ubuntu-22.04
|
||||
@@ -72,9 +72,10 @@ jobs:
|
||||
sudo make install
|
||||
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
|
||||
with:
|
||||
fetch-depth: 0
|
||||
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Get benchmark details
|
||||
run: |
|
||||
@@ -90,7 +91,7 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
@@ -115,16 +116,26 @@ jobs:
|
||||
} >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Check device is detected
|
||||
if: ${{ !cancelled() }}
|
||||
run: nvidia-smi
|
||||
|
||||
- name: Run benchmarks with AVX512
|
||||
run: |
|
||||
make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_${{ matrix.command }}_gpu
|
||||
|
||||
# Run these benchmarks only once
|
||||
- name: Run compression benchmarks with AVX512
|
||||
if: matrix.op_flavor == 'default' && matrix.command == 'integer'
|
||||
run: |
|
||||
make bench_integer_compression_gpu
|
||||
|
||||
- name: Parse results
|
||||
run: |
|
||||
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
|
||||
@@ -140,7 +151,7 @@ jobs:
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@0b2256b8c012f0828dc542b3febcab082c67f72b
|
||||
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
|
||||
with:
|
||||
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
@@ -148,22 +159,14 @@ jobs:
|
||||
- name: Send data to Slab
|
||||
shell: bash
|
||||
run: |
|
||||
echo "Computing HMac on results file"
|
||||
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
|
||||
echo "Sending results to Slab..."
|
||||
curl -v -k \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "X-Slab-Repository: ${{ github.repository }}" \
|
||||
-H "X-Slab-Command: store_data_v2" \
|
||||
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
|
||||
-d @${{ env.RESULTS_FILENAME }} \
|
||||
${{ secrets.SLAB_URL }}
|
||||
python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
|
||||
--slab-url "${{ secrets.SLAB_URL }}"
|
||||
|
||||
slack-notify:
|
||||
name: Slack Notification
|
||||
needs: [ setup-instance, cuda-integer-full-benchmarks ]
|
||||
runs-on: ubuntu-latest
|
||||
if: ${{ !success() && !cancelled() }}
|
||||
if: ${{ always() && needs.cuda-integer-full-benchmarks.result != 'skipped' && failure() }}
|
||||
continue-on-error: true
|
||||
steps:
|
||||
- name: Send message
|
||||
@@ -180,7 +183,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -42,7 +42,7 @@ jobs:
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -81,9 +81,10 @@ jobs:
|
||||
sudo make install
|
||||
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
|
||||
with:
|
||||
fetch-depth: 0
|
||||
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Get benchmark details
|
||||
run: |
|
||||
@@ -99,7 +100,7 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
@@ -133,6 +134,10 @@ jobs:
|
||||
run: |
|
||||
echo "BENCH_OP_FLAVOR=fast_default" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Check device is detected
|
||||
if: ${{ !cancelled() }}
|
||||
run: nvidia-smi
|
||||
|
||||
- name: Run multi-bit benchmarks with AVX512
|
||||
run: |
|
||||
make bench_unsigned_integer_multi_bit_gpu
|
||||
@@ -143,7 +148,7 @@ jobs:
|
||||
parse_integer_benches
|
||||
|
||||
- name: Upload csv results artifact
|
||||
uses: actions/upload-artifact@0b2256b8c012f0828dc542b3febcab082c67f72b
|
||||
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
|
||||
with:
|
||||
name: ${{ github.sha }}_csv_integer
|
||||
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
|
||||
@@ -163,13 +168,13 @@ jobs:
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@0b2256b8c012f0828dc542b3febcab082c67f72b
|
||||
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
|
||||
with:
|
||||
name: ${{ github.sha }}_integer
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
@@ -178,23 +183,14 @@ jobs:
|
||||
- name: Send data to Slab
|
||||
shell: bash
|
||||
run: |
|
||||
echo "Computing HMac on results file"
|
||||
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
|
||||
echo "Sending results to Slab..."
|
||||
curl -v -k \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "X-Slab-Repository: ${{ github.repository }}" \
|
||||
-H "X-Slab-Command: store_data_v2" \
|
||||
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
|
||||
-d @${{ env.RESULTS_FILENAME }} \
|
||||
${{ secrets.SLAB_URL }}
|
||||
|
||||
python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
|
||||
--slab-url "${{ secrets.SLAB_URL }}"
|
||||
|
||||
slack-notify:
|
||||
name: Slack Notification
|
||||
needs: [ setup-instance, cuda-integer-multi-bit-benchmarks ]
|
||||
runs-on: ubuntu-latest
|
||||
if: ${{ !success() && !cancelled() }}
|
||||
if: ${{ always() && needs.cuda-integer-multi-bit-benchmarks.result != 'skipped' && failure() }}
|
||||
continue-on-error: true
|
||||
steps:
|
||||
- name: Send message
|
||||
@@ -211,7 +207,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -34,20 +34,22 @@ jobs:
|
||||
setup-instance:
|
||||
name: Setup instance (cuda-integer-multi-bit-multi-gpu-benchmarks)
|
||||
runs-on: ubuntu-latest
|
||||
if: ${{ (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs') || github.event_name == 'workflow_dispatch' }}
|
||||
if: ${{ (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs') ||
|
||||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
|
||||
github.event_name == 'workflow_dispatch' }}
|
||||
outputs:
|
||||
runner-name: ${{ steps.start-instance.outputs.label }}
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
backend: aws
|
||||
profile: multi-gpu-test
|
||||
backend: hyperstack
|
||||
profile: multi-h100
|
||||
|
||||
cuda-integer-multi-bit-multi-gpu-benchmarks:
|
||||
name: Execute multi GPU integer multi-bit benchmarks
|
||||
@@ -62,15 +64,28 @@ jobs:
|
||||
include:
|
||||
- os: ubuntu-22.04
|
||||
cuda: "12.2"
|
||||
gcc: 9
|
||||
gcc: 11
|
||||
env:
|
||||
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
|
||||
|
||||
CMAKE_VERSION: 3.29.6
|
||||
steps:
|
||||
# Mandatory on hyperstack since a bootable volume is not re-usable yet.
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
sudo apt update
|
||||
sudo apt install -y checkinstall zlib1g-dev libssl-dev
|
||||
wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
|
||||
tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
|
||||
cd cmake-${{ env.CMAKE_VERSION }}
|
||||
./bootstrap
|
||||
make -j"$(nproc)"
|
||||
sudo make install
|
||||
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
|
||||
with:
|
||||
fetch-depth: 0
|
||||
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Get benchmark details
|
||||
run: |
|
||||
@@ -86,7 +101,7 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
@@ -111,7 +126,7 @@ jobs:
|
||||
} >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
@@ -127,6 +142,10 @@ jobs:
|
||||
run: |
|
||||
echo "BENCH_OP_FLAVOR=fast_default" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Check device is detected
|
||||
if: ${{ !cancelled() }}
|
||||
run: nvidia-smi
|
||||
|
||||
- name: Run multi-bit benchmarks with AVX512
|
||||
run: |
|
||||
make bench_unsigned_integer_multi_bit_gpu
|
||||
@@ -135,7 +154,7 @@ jobs:
|
||||
run: |
|
||||
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
|
||||
--database tfhe_rs \
|
||||
--hardware "p3.8xlarge" \
|
||||
--hardware "n3-H100x8" \
|
||||
--backend gpu \
|
||||
--project-version "${{ env.COMMIT_HASH }}" \
|
||||
--branch ${{ github.ref_name }} \
|
||||
@@ -146,7 +165,7 @@ jobs:
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@0b2256b8c012f0828dc542b3febcab082c67f72b
|
||||
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
|
||||
with:
|
||||
name: ${{ github.sha }}_integer
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
@@ -154,22 +173,14 @@ jobs:
|
||||
- name: Send data to Slab
|
||||
shell: bash
|
||||
run: |
|
||||
echo "Computing HMac on results file"
|
||||
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
|
||||
echo "Sending results to Slab..."
|
||||
curl -v -k \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "X-Slab-Repository: ${{ github.repository }}" \
|
||||
-H "X-Slab-Command: store_data_v2" \
|
||||
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
|
||||
-d @${{ env.RESULTS_FILENAME }} \
|
||||
${{ secrets.SLAB_URL }}
|
||||
python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
|
||||
--slab-url "${{ secrets.SLAB_URL }}"
|
||||
|
||||
slack-notify:
|
||||
name: Slack Notification
|
||||
needs: [ setup-instance, cuda-integer-multi-bit-multi-gpu-benchmarks ]
|
||||
runs-on: ubuntu-latest
|
||||
if: ${{ !success() && !cancelled() }}
|
||||
if: ${{ always() && needs.cuda-integer-multi-bit-multi-gpu-benchmarks.result != 'skipped' && failure() }}
|
||||
continue-on-error: true
|
||||
steps:
|
||||
- name: Send message
|
||||
@@ -186,7 +197,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -29,17 +29,17 @@ jobs:
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
backend: aws
|
||||
profile: multi-gpu-test
|
||||
backend: hyperstack
|
||||
profile: multi-h100
|
||||
|
||||
cuda-integer-full-multi-gpu-benchmarks:
|
||||
name: Execute multi GPU integer benchmarks for all operations flavor
|
||||
name: Execute multi GPU integer benchmarks
|
||||
needs: setup-instance
|
||||
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
|
||||
timeout-minutes: 1440 # 24 hours
|
||||
@@ -48,21 +48,34 @@ jobs:
|
||||
fail-fast: false
|
||||
max-parallel: 1
|
||||
matrix:
|
||||
command: [integer, integer_multi_bit]
|
||||
op_flavor: [default, unchecked]
|
||||
command: [integer_multi_bit]
|
||||
op_flavor: [default]
|
||||
# explicit include-based build matrix, of known valid options
|
||||
include:
|
||||
- os: ubuntu-22.04
|
||||
cuda: "12.2"
|
||||
gcc: 9
|
||||
gcc: 11
|
||||
env:
|
||||
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
|
||||
|
||||
CMAKE_VERSION: 3.29.6
|
||||
steps:
|
||||
# Mandatory on hyperstack since a bootable volume is not re-usable yet.
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
sudo apt update
|
||||
sudo apt install -y checkinstall zlib1g-dev libssl-dev
|
||||
wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
|
||||
tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
|
||||
cd cmake-${{ env.CMAKE_VERSION }}
|
||||
./bootstrap
|
||||
make -j"$(nproc)"
|
||||
sudo make install
|
||||
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
|
||||
with:
|
||||
fetch-depth: 0
|
||||
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Get benchmark details
|
||||
run: |
|
||||
@@ -78,7 +91,7 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
@@ -103,12 +116,16 @@ jobs:
|
||||
} >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Check device is detected
|
||||
if: ${{ !cancelled() }}
|
||||
run: nvidia-smi
|
||||
|
||||
- name: Run benchmarks with AVX512
|
||||
run: |
|
||||
make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_${{ matrix.command }}_gpu
|
||||
@@ -117,7 +134,7 @@ jobs:
|
||||
run: |
|
||||
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
|
||||
--database tfhe_rs \
|
||||
--hardware "p3.8xlarge" \
|
||||
--hardware "n3-H100x8" \
|
||||
--backend gpu \
|
||||
--project-version "${{ env.COMMIT_HASH }}" \
|
||||
--branch ${{ github.ref_name }} \
|
||||
@@ -128,7 +145,7 @@ jobs:
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@0b2256b8c012f0828dc542b3febcab082c67f72b
|
||||
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
|
||||
with:
|
||||
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
@@ -136,22 +153,14 @@ jobs:
|
||||
- name: Send data to Slab
|
||||
shell: bash
|
||||
run: |
|
||||
echo "Computing HMac on results file"
|
||||
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
|
||||
echo "Sending results to Slab..."
|
||||
curl -v -k \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "X-Slab-Repository: ${{ github.repository }}" \
|
||||
-H "X-Slab-Command: store_data_v2" \
|
||||
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
|
||||
-d @${{ env.RESULTS_FILENAME }} \
|
||||
${{ secrets.SLAB_URL }}
|
||||
python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
|
||||
--slab-url "${{ secrets.SLAB_URL }}"
|
||||
|
||||
slack-notify:
|
||||
name: Slack Notification
|
||||
needs: [ setup-instance, cuda-integer-full-multi-gpu-benchmarks ]
|
||||
runs-on: ubuntu-latest
|
||||
if: ${{ !success() && !cancelled() }}
|
||||
if: ${{ always() && needs.cuda-integer-full-multi-gpu-benchmarks.result != 'skipped' && failure() }}
|
||||
continue-on-error: true
|
||||
steps:
|
||||
- name: Send message
|
||||
@@ -168,7 +177,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
206
.github/workflows/benchmark_gpu_l40.yml
vendored
Normal file
206
.github/workflows/benchmark_gpu_l40.yml
vendored
Normal file
@@ -0,0 +1,206 @@
|
||||
# Run benchmarks on an L40 VM and return parsed results to Slab CI bot.
|
||||
name: Cuda benchmarks (L40)
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
schedule:
|
||||
# Weekly benchmarks will be triggered each Saturday at 1a.m.
|
||||
- cron: '0 1 * * 6'
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUST_BACKTRACE: "full"
|
||||
RUST_MIN_STACK: "8388608"
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
|
||||
jobs:
|
||||
setup-instance:
|
||||
name: Setup instance (cuda-l40-benchmarks)
|
||||
runs-on: ubuntu-latest
|
||||
if: github.event_name != 'schedule' ||
|
||||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
|
||||
outputs:
|
||||
runner-name: ${{ steps.start-instance.outputs.label }}
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
backend: hyperstack
|
||||
profile: l40
|
||||
|
||||
cuda-l40-benchmarks:
|
||||
name: Cuda benchmarks (L40)
|
||||
needs: setup-instance
|
||||
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
|
||||
timeout-minutes: 1440 # 24 hours
|
||||
continue-on-error: true
|
||||
strategy:
|
||||
fail-fast: false
|
||||
max-parallel: 1
|
||||
matrix:
|
||||
command: [integer_multi_bit]
|
||||
op_flavor: [default]
|
||||
# explicit include-based build matrix, of known valid options
|
||||
include:
|
||||
- os: ubuntu-22.04
|
||||
cuda: "12.2"
|
||||
gcc: 11
|
||||
env:
|
||||
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
|
||||
CMAKE_VERSION: 3.29.6
|
||||
steps:
|
||||
# Mandatory on hyperstack since a bootable volume is not re-usable yet.
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
sudo apt update
|
||||
sudo apt install -y checkinstall zlib1g-dev libssl-dev
|
||||
wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
|
||||
tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
|
||||
cd cmake-${{ env.CMAKE_VERSION }}
|
||||
./bootstrap
|
||||
make -j"$(nproc)"
|
||||
sudo make install
|
||||
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
|
||||
with:
|
||||
fetch-depth: 0
|
||||
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Get benchmark details
|
||||
run: |
|
||||
{
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)";
|
||||
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
|
||||
echo "COMMIT_HASH=$(git describe --tags --dirty)";
|
||||
} >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Set up home
|
||||
# "Install rust" step require root user to have a HOME directory which is not set.
|
||||
run: |
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
- name: Export CUDA variables
|
||||
if: ${{ !cancelled() }}
|
||||
run: |
|
||||
{
|
||||
echo "CUDA_PATH=$CUDA_PATH";
|
||||
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
|
||||
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
|
||||
} >> "${GITHUB_ENV}"
|
||||
echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
|
||||
|
||||
# Specify the correct host compilers
|
||||
- name: Export gcc and g++ variables
|
||||
if: ${{ !cancelled() }}
|
||||
run: |
|
||||
{
|
||||
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
|
||||
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
|
||||
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
|
||||
} >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Check device is detected
|
||||
if: ${{ !cancelled() }}
|
||||
run: nvidia-smi
|
||||
|
||||
- name: Run benchmarks with AVX512
|
||||
run: |
|
||||
make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_${{ matrix.command }}_gpu
|
||||
|
||||
- name: Run compression benchmarks with AVX512
|
||||
run: |
|
||||
make bench_integer_compression_gpu
|
||||
|
||||
- name: Run PBS benchmarks
|
||||
run: |
|
||||
make bench_pbs_gpu
|
||||
|
||||
- name: Run KS benchmarks
|
||||
run: |
|
||||
make bench_ks_gpu
|
||||
|
||||
- name: Parse results
|
||||
run: |
|
||||
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
|
||||
--database tfhe_rs \
|
||||
--hardware "n3-L40x1" \
|
||||
--backend gpu \
|
||||
--project-version "${{ env.COMMIT_HASH }}" \
|
||||
--branch ${{ github.ref_name }} \
|
||||
--commit-date "${{ env.COMMIT_DATE }}" \
|
||||
--bench-date "${{ env.BENCH_DATE }}" \
|
||||
--walk-subdirs \
|
||||
--name-suffix avx512 \
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
|
||||
with:
|
||||
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
- name: Send data to Slab
|
||||
shell: bash
|
||||
run: |
|
||||
python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
|
||||
--slab-url "${{ secrets.SLAB_URL }}"
|
||||
|
||||
slack-notify:
|
||||
name: Slack Notification
|
||||
needs: [ setup-instance, cuda-l40-benchmarks ]
|
||||
runs-on: ubuntu-latest
|
||||
if: ${{ always() && needs.cuda-l40-benchmarks.result != 'skipped' && failure() }}
|
||||
continue-on-error: true
|
||||
steps:
|
||||
- name: Send message
|
||||
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
|
||||
env:
|
||||
SLACK_COLOR: ${{ needs.cuda-l40-benchmarks.result }}
|
||||
SLACK_MESSAGE: "Cuda benchmarks (L40) finished with status: ${{ needs.cuda-l40-benchmarks.result }}. (${{ env.ACTION_RUN_URL }})"
|
||||
|
||||
teardown-instance:
|
||||
name: Teardown instance (cuda-l40-benchmarks)
|
||||
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
|
||||
needs: [ setup-instance, cuda-l40-benchmarks, slack-notify ]
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
label: ${{ needs.setup-instance.outputs.runner-name }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "Instance teardown (cuda-l40-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
@@ -62,7 +62,7 @@ jobs:
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -87,9 +87,10 @@ jobs:
|
||||
op_flavor: ${{ fromJson(needs.prepare-matrix.outputs.op_flavor) }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
|
||||
with:
|
||||
fetch-depth: 0
|
||||
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Get benchmark details
|
||||
run: |
|
||||
@@ -105,12 +106,12 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
@@ -125,6 +126,12 @@ jobs:
|
||||
run: |
|
||||
make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_${{ matrix.command }}
|
||||
|
||||
# Run these benchmarks only once
|
||||
- name: Run compression benchmarks with AVX512
|
||||
if: matrix.op_flavor == 'default' && matrix.command == 'integer'
|
||||
run: |
|
||||
make bench_integer_compression
|
||||
|
||||
- name: Parse results
|
||||
run: |
|
||||
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
|
||||
@@ -139,7 +146,7 @@ jobs:
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@0b2256b8c012f0828dc542b3febcab082c67f72b
|
||||
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
|
||||
with:
|
||||
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
@@ -147,16 +154,8 @@ jobs:
|
||||
- name: Send data to Slab
|
||||
shell: bash
|
||||
run: |
|
||||
echo "Computing HMac on results file"
|
||||
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
|
||||
echo "Sending results to Slab..."
|
||||
curl -v -k \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "X-Slab-Repository: ${{ github.repository }}" \
|
||||
-H "X-Slab-Command: store_data_v2" \
|
||||
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
|
||||
-d @${{ env.RESULTS_FILENAME }} \
|
||||
${{ secrets.SLAB_URL }}
|
||||
python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
|
||||
--slab-url "${{ secrets.SLAB_URL }}"
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() }}
|
||||
@@ -174,7 +173,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -56,7 +56,7 @@ jobs:
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -79,9 +79,10 @@ jobs:
|
||||
op_flavor: ${{ fromJson(needs.prepare-matrix.outputs.op_flavor) }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
|
||||
with:
|
||||
fetch-depth: 0
|
||||
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Get benchmark details
|
||||
run: |
|
||||
@@ -97,12 +98,12 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
@@ -141,7 +142,7 @@ jobs:
|
||||
--append-results
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@0b2256b8c012f0828dc542b3febcab082c67f72b
|
||||
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
|
||||
with:
|
||||
name: ${{ github.sha }}_shortint_${{ matrix.op_flavor }}
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
@@ -149,16 +150,8 @@ jobs:
|
||||
- name: Send data to Slab
|
||||
shell: bash
|
||||
run: |
|
||||
echo "Computing HMac on results file"
|
||||
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
|
||||
echo "Sending results to Slab..."
|
||||
curl -v -k \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "X-Slab-Repository: ${{ github.repository }}" \
|
||||
-H "X-Slab-Command: store_data_v2" \
|
||||
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
|
||||
-d @${{ env.RESULTS_FILENAME }} \
|
||||
${{ secrets.SLAB_URL }}
|
||||
python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
|
||||
--slab-url "${{ secrets.SLAB_URL }}"
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() }}
|
||||
@@ -176,7 +169,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -62,7 +62,7 @@ jobs:
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -87,9 +87,10 @@ jobs:
|
||||
op_flavor: [ default, unchecked ]
|
||||
steps:
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
|
||||
with:
|
||||
fetch-depth: 0
|
||||
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Get benchmark details
|
||||
run: |
|
||||
@@ -105,12 +106,12 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
@@ -139,7 +140,7 @@ jobs:
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@0b2256b8c012f0828dc542b3febcab082c67f72b
|
||||
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
|
||||
with:
|
||||
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
@@ -147,16 +148,8 @@ jobs:
|
||||
- name: Send data to Slab
|
||||
shell: bash
|
||||
run: |
|
||||
echo "Computing HMac on results file"
|
||||
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
|
||||
echo "Sending results to Slab..."
|
||||
curl -v -k \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "X-Slab-Repository: ${{ github.repository }}" \
|
||||
-H "X-Slab-Command: store_data_v2" \
|
||||
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
|
||||
-d @${{ env.RESULTS_FILENAME }} \
|
||||
${{ secrets.SLAB_URL }}
|
||||
python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
|
||||
--slab-url "${{ secrets.SLAB_URL }}"
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() }}
|
||||
@@ -174,7 +167,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -25,20 +25,21 @@ jobs:
|
||||
should-run:
|
||||
runs-on: ubuntu-latest
|
||||
if: github.event_name == 'workflow_dispatch' ||
|
||||
((github.event_name == 'push' || github.event_name == 'schedule') && github.repository == 'zama-ai/tfhe-rs')
|
||||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
|
||||
(github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs')
|
||||
permissions:
|
||||
pull-requests: write
|
||||
outputs:
|
||||
wasm_bench: ${{ steps.changed-files.outputs.wasm_bench_any_changed }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Check for file changes
|
||||
id: changed-files
|
||||
uses: tj-actions/changed-files@cc733854b1f224978ef800d29e4709d5ee2883e4
|
||||
uses: tj-actions/changed-files@48d8f15b2aaa3d255ca5af3eba4870f807ce6b3c
|
||||
with:
|
||||
since_last_remote_commit: true
|
||||
files_yaml: |
|
||||
@@ -53,7 +54,8 @@ jobs:
|
||||
|
||||
setup-instance:
|
||||
name: Setup instance (wasm-client-benchmarks)
|
||||
if: github.event_name != 'push' ||
|
||||
if: github.event_name == 'workflow_dispatch' ||
|
||||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
|
||||
(github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs' && needs.should-run.outputs.wasm_bench)
|
||||
needs: should-run
|
||||
runs-on: ubuntu-latest
|
||||
@@ -62,7 +64,7 @@ jobs:
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -73,15 +75,15 @@ jobs:
|
||||
|
||||
wasm-client-benchmarks:
|
||||
name: Execute WASM client benchmarks
|
||||
needs: [ should-run, setup-instance ]
|
||||
if: github.event_name != 'push' ||
|
||||
(github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs' && needs.should-run.outputs.wasm_bench)
|
||||
needs: setup-instance
|
||||
if: needs.setup-instance.result != 'skipped'
|
||||
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
|
||||
with:
|
||||
fetch-depth: 0
|
||||
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Get benchmark details
|
||||
run: |
|
||||
@@ -97,14 +99,19 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
- name: Run benchmarks
|
||||
- name: Install web resources
|
||||
run: |
|
||||
make install_node
|
||||
make bench_web_js_api_parallel_ci
|
||||
make install_chrome_browser
|
||||
make install_chrome_web_driver
|
||||
|
||||
- name: Run benchmarks
|
||||
run: |
|
||||
make bench_web_js_api_parallel_chrome_ci
|
||||
|
||||
- name: Parse results
|
||||
run: |
|
||||
@@ -129,13 +136,13 @@ jobs:
|
||||
--append-results
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@0b2256b8c012f0828dc542b3febcab082c67f72b
|
||||
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
|
||||
with:
|
||||
name: ${{ github.sha }}_wasm
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
@@ -144,16 +151,8 @@ jobs:
|
||||
- name: Send data to Slab
|
||||
shell: bash
|
||||
run: |
|
||||
echo "Computing HMac on results file"
|
||||
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
|
||||
echo "Sending results to Slab..."
|
||||
curl -v -k \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "X-Slab-Repository: ${{ github.repository }}" \
|
||||
-H "X-Slab-Command: store_data_v2" \
|
||||
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
|
||||
-d @${{ env.RESULTS_FILENAME }} \
|
||||
${{ secrets.SLAB_URL }}
|
||||
python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
|
||||
--slab-url "${{ secrets.SLAB_URL }}"
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() }}
|
||||
@@ -171,7 +170,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -24,19 +24,19 @@ env:
|
||||
jobs:
|
||||
should-run:
|
||||
runs-on: ubuntu-latest
|
||||
if: github.event_name != 'push' ||
|
||||
(github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs')
|
||||
if: github.event_name == 'workflow_dispatch' ||
|
||||
((github.event_name == 'push' || github.event_name == 'schedule') && github.repository == 'zama-ai/tfhe-rs')
|
||||
outputs:
|
||||
zk_pok_changed: ${{ steps.changed-files.outputs.zk_pok_any_changed }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Check for file changes
|
||||
id: changed-files
|
||||
uses: tj-actions/changed-files@cc733854b1f224978ef800d29e4709d5ee2883e4
|
||||
uses: tj-actions/changed-files@48d8f15b2aaa3d255ca5af3eba4870f807ce6b3c
|
||||
with:
|
||||
since_last_remote_commit: true
|
||||
files_yaml: |
|
||||
@@ -55,7 +55,7 @@ jobs:
|
||||
name: Setup instance (pke-zk-benchmarks)
|
||||
runs-on: ubuntu-latest
|
||||
needs: should-run
|
||||
if: github.event_name != 'push' ||
|
||||
if: github.event_name == 'workflow_dispatch' ||
|
||||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
|
||||
(github.event_name == 'push' &&
|
||||
github.repository == 'zama-ai/tfhe-rs' &&
|
||||
@@ -65,7 +65,7 @@ jobs:
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -76,19 +76,18 @@ jobs:
|
||||
|
||||
pke-zk-benchmarks:
|
||||
name: Execute PKE ZK benchmarks
|
||||
if: github.event_name != 'push' ||
|
||||
((github.event_name == 'push' || github.event_name == 'schedule') &&
|
||||
needs.setup-instance.result != 'skipped')
|
||||
needs: [ should-run, setup-instance ]
|
||||
if: needs.setup-instance.result != 'skipped'
|
||||
needs: setup-instance
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}_${{github.event_name}}_${{ github.ref }}
|
||||
group: ${{ github.workflow }}_${{github.event_name}}_${{ github.ref }}${{ github.ref == 'refs/heads/main' && github.sha || '' }}
|
||||
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
|
||||
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
|
||||
with:
|
||||
fetch-depth: 0
|
||||
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Get benchmark details
|
||||
run: |
|
||||
@@ -104,12 +103,12 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
@@ -140,13 +139,13 @@ jobs:
|
||||
--append-results
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@0b2256b8c012f0828dc542b3febcab082c67f72b
|
||||
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
|
||||
with:
|
||||
name: ${{ github.sha }}_integer_zk
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
@@ -155,19 +154,11 @@ jobs:
|
||||
- name: Send data to Slab
|
||||
shell: bash
|
||||
run: |
|
||||
echo "Computing HMac on results file"
|
||||
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
|
||||
echo "Sending results to Slab..."
|
||||
curl -v -k \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "X-Slab-Repository: ${{ github.repository }}" \
|
||||
-H "X-Slab-Command: store_data_v2" \
|
||||
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
|
||||
-d @${{ env.RESULTS_FILENAME }} \
|
||||
${{ secrets.SLAB_URL }}
|
||||
python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
|
||||
--slab-url "${{ secrets.SLAB_URL }}"
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ !success() && !cancelled() }}
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
|
||||
env:
|
||||
@@ -182,7 +173,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
21
.github/workflows/cargo_build.yml
vendored
21
.github/workflows/cargo_build.yml
vendored
@@ -19,14 +19,21 @@ jobs:
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
os: [large_ubuntu_16, macos-latest-large, large_windows_16_latest]
|
||||
# GitHub macos-latest are now M1 macs, so use ours, we limit what runs so it will be fast
|
||||
# even with a few PRs
|
||||
os: [large_ubuntu_16, macos-latest, windows-latest]
|
||||
fail-fast: false
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
- uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
- name: Install and run newline linter checks
|
||||
if: matrix.os == 'ubuntu-latest'
|
||||
if: ${{ contains(matrix.os, 'ubuntu') }}
|
||||
run: |
|
||||
wget https://github.com/fernandrone/linelint/releases/download/0.0.6/linelint-linux-amd64
|
||||
echo "16b70fb7b471d6f95cbdc0b4e5dc2b0ac9e84ba9ecdc488f7bdf13df823aca4b linelint-linux-amd64" > checksum
|
||||
@@ -36,27 +43,33 @@ jobs:
|
||||
make check_newline
|
||||
|
||||
- name: Run pcc checks
|
||||
if: ${{ contains(matrix.os, 'ubuntu') }}
|
||||
run: |
|
||||
make pcc
|
||||
|
||||
- name: Build concrete-csprng
|
||||
if: ${{ contains(matrix.os, 'ubuntu') }}
|
||||
run: |
|
||||
make build_concrete_csprng
|
||||
|
||||
- name: Build Release core
|
||||
if: ${{ contains(matrix.os, 'ubuntu') }}
|
||||
run: |
|
||||
make build_core AVX512_SUPPORT=ON
|
||||
make build_core_experimental AVX512_SUPPORT=ON
|
||||
|
||||
- name: Build Release boolean
|
||||
if: ${{ contains(matrix.os, 'ubuntu') }}
|
||||
run: |
|
||||
make build_boolean
|
||||
|
||||
- name: Build Release shortint
|
||||
if: ${{ contains(matrix.os, 'ubuntu') }}
|
||||
run: |
|
||||
make build_shortint
|
||||
|
||||
- name: Build Release integer
|
||||
if: ${{ contains(matrix.os, 'ubuntu') }}
|
||||
run: |
|
||||
make build_integer
|
||||
|
||||
@@ -65,10 +78,12 @@ jobs:
|
||||
make build_tfhe_full
|
||||
|
||||
- name: Build Release c_api
|
||||
if: ${{ contains(matrix.os, 'ubuntu') }}
|
||||
run: |
|
||||
make build_c_api
|
||||
|
||||
- name: Build coverage tests
|
||||
if: ${{ contains(matrix.os, 'ubuntu') }}
|
||||
run: |
|
||||
make build_tfhe_coverage
|
||||
|
||||
|
||||
8
.github/workflows/ci_lint.yml
vendored
8
.github/workflows/ci_lint.yml
vendored
@@ -13,7 +13,7 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
|
||||
|
||||
- name: Get actionlint
|
||||
run: |
|
||||
@@ -25,3 +25,9 @@ jobs:
|
||||
- name: Lint workflows
|
||||
run: |
|
||||
make lint_workflow
|
||||
|
||||
- name: Ensure SHA pinned actions
|
||||
uses: zgosalvez/github-actions-ensure-sha-pinned-actions@0901cf7b71c7ea6261ec69a3dc2bd3f9264f893e # v3.0.12
|
||||
with:
|
||||
allowlist: |
|
||||
slsa-framework/slsa-github-generator
|
||||
|
||||
10
.github/workflows/code_coverage.yml
vendored
10
.github/workflows/code_coverage.yml
vendored
@@ -25,7 +25,7 @@ jobs:
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -44,20 +44,20 @@ jobs:
|
||||
timeout-minutes: 5760 # 4 days
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
|
||||
|
||||
- name: Set up home
|
||||
run: |
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
- name: Check for file changes
|
||||
id: changed-files
|
||||
uses: tj-actions/changed-files@cc733854b1f224978ef800d29e4709d5ee2883e4
|
||||
uses: tj-actions/changed-files@48d8f15b2aaa3d255ca5af3eba4870f807ce6b3c
|
||||
with:
|
||||
files_yaml: |
|
||||
tfhe:
|
||||
@@ -125,7 +125,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
@@ -27,7 +27,7 @@ jobs:
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -45,16 +45,17 @@ jobs:
|
||||
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
|
||||
with:
|
||||
persist-credentials: 'false'
|
||||
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Set up home
|
||||
run: |
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
@@ -78,7 +79,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
123
.github/workflows/data_pr_close.yml
vendored
Normal file
123
.github/workflows/data_pr_close.yml
vendored
Normal file
@@ -0,0 +1,123 @@
|
||||
name: Close or Merge corresponding PR on the data repo
|
||||
|
||||
# When a PR with the data_PR label is closed or merged, this will close or merge the corresponding PR in the data repo.
|
||||
|
||||
env:
|
||||
TARGET_REPO_API_URL: ${{ github.api_url }}/repos/zama-ai/tfhe-backward-compat-data
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
PR_BRANCH: ${{ github.head_ref || github.ref_name }}
|
||||
CLOSE_TYPE: ${{ github.event.pull_request.merged && 'merge' || 'close' }}
|
||||
|
||||
# only trigger on pull request closed events
|
||||
on:
|
||||
pull_request:
|
||||
types: [ closed ]
|
||||
|
||||
# The same pattern is used for jobs that use the github api:
|
||||
# - save the result of the API call in the env var "GH_API_RES". Since the var is multiline
|
||||
# we use this trick: https://docs.github.com/en/actions/using-workflows/workflow-commands-for-github-actions#example-of-a-multiline-string
|
||||
# - "set +e" will make sure we reach the last "echo EOF" even in case of error
|
||||
# - "set -o pipefail" makes a one-line piped command return the error of the first failure
|
||||
# - 'RES="$?"' and 'exit $RES' are used to return the error code if a command failed. Without them, with "set +e"
|
||||
# the script will always return 0 because of the "echo EOF".
|
||||
|
||||
|
||||
jobs:
|
||||
auto_close_job:
|
||||
if: ${{ contains(github.event.pull_request.labels.*.name, 'data_PR') }}
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Find corresponding Pull Request in the data repo
|
||||
run: |
|
||||
{
|
||||
set +e
|
||||
set -o pipefail
|
||||
echo 'TARGET_REPO_PR<<EOF'
|
||||
curl --fail-with-body --no-progress-meter -L -X GET \
|
||||
-H "Accept: application/vnd.github+json" \
|
||||
-H "X-GitHub-Api-Version: 2022-11-28" \
|
||||
${{ env.TARGET_REPO_API_URL }}/pulls\?head=${{ github.repository_owner }}:${{ env.PR_BRANCH }} | jq -e '.[0]' | sed 's/null/{ "message": "corresponding PR not found" }/'
|
||||
RES="$?"
|
||||
echo EOF
|
||||
} >> "${GITHUB_ENV}"
|
||||
exit $RES
|
||||
|
||||
- name: Comment on the PR to indicate the reason of the close
|
||||
run: |
|
||||
{
|
||||
set +e
|
||||
set -o pipefail
|
||||
echo 'GH_API_RES<<EOF'
|
||||
curl --fail-with-body --no-progress-meter -L -X POST \
|
||||
-H "Accept: application/vnd.github+json" \
|
||||
-H "Authorization: Bearer ${{ secrets.FHE_ACTIONS_TOKEN }}" \
|
||||
-H "X-GitHub-Api-Version: 2022-11-28" \
|
||||
${{ fromJson(env.TARGET_REPO_PR).comments_url }} \
|
||||
-d '{ "body": "PR ${{ env.CLOSE_TYPE }}d because the corresponding PR in main repo was ${{ env.CLOSE_TYPE }}d: ${{ github.repository }}#${{ github.event.number }}" }'
|
||||
RES="$?"
|
||||
echo EOF
|
||||
} >> "${GITHUB_ENV}"
|
||||
exit $RES
|
||||
|
||||
- name: Merge the Pull Request in the data repo
|
||||
if: ${{ github.event.pull_request.merged }}
|
||||
run: |
|
||||
{
|
||||
set +e
|
||||
set -o pipefail
|
||||
echo 'GH_API_RES<<EOF'
|
||||
curl --fail-with-body --no-progress-meter -L -X PUT \
|
||||
-H "Accept: application/vnd.github+json" \
|
||||
-H "Authorization: Bearer ${{ secrets.FHE_ACTIONS_TOKEN }}" \
|
||||
-H "X-GitHub-Api-Version: 2022-11-28" \
|
||||
${{ fromJson(env.TARGET_REPO_PR).url }}/merge \
|
||||
-d '{ "merge_method": "rebase" }'
|
||||
RES="$?"
|
||||
echo EOF
|
||||
} >> "${GITHUB_ENV}"
|
||||
exit $RES
|
||||
|
||||
- name: Close the Pull Request in the data repo
|
||||
if: ${{ !github.event.pull_request.merged }}
|
||||
run: |
|
||||
{
|
||||
set +e
|
||||
set -o pipefail
|
||||
echo 'GH_API_RES<<EOF'
|
||||
curl --fail-with-body --no-progress-meter -L -X PATCH \
|
||||
-H "Accept: application/vnd.github+json" \
|
||||
-H "Authorization: Bearer ${{ secrets.FHE_ACTIONS_TOKEN }}" \
|
||||
-H "X-GitHub-Api-Version: 2022-11-28" \
|
||||
${{ fromJson(env.TARGET_REPO_PR).url }} \
|
||||
-d '{ "state": "closed" }'
|
||||
RES="$?"
|
||||
echo EOF
|
||||
} >> "${GITHUB_ENV}"
|
||||
exit $RES
|
||||
|
||||
- name: Delete the associated branch in the data repo
|
||||
run: |
|
||||
{
|
||||
set +e
|
||||
set -o pipefail
|
||||
echo 'GH_API_RES<<EOF'
|
||||
curl --fail-with-body --no-progress-meter -L -X DELETE \
|
||||
-H "Accept: application/vnd.github+json" \
|
||||
-H "Authorization: Bearer ${{ secrets.FHE_ACTIONS_TOKEN }}" \
|
||||
-H "X-GitHub-Api-Version: 2022-11-28" \
|
||||
${{ env.TARGET_REPO_API_URL }}/git/refs/heads/${{ env.PR_BRANCH }}
|
||||
RES="$?"
|
||||
echo EOF
|
||||
} >> "${GITHUB_ENV}"
|
||||
exit $RES
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ always() && job.status == 'failure' }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "Failed to auto-${{ env.CLOSE_TYPE }} PR on data repo: ${{ fromJson(env.GH_API_RES || env.TARGET_REPO_PR).message }}"
|
||||
@@ -34,12 +34,13 @@ jobs:
|
||||
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
|
||||
with:
|
||||
persist-credentials: 'false'
|
||||
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
@@ -74,7 +75,7 @@ jobs:
|
||||
github_token: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ always() }}
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
|
||||
env:
|
||||
201
.github/workflows/gpu_fast_h100_tests.yml
vendored
Normal file
201
.github/workflows/gpu_fast_h100_tests.yml
vendored
Normal file
@@ -0,0 +1,201 @@
|
||||
# Compile and test tfhe-cuda-backend on an H100 VM on hyperstack
|
||||
name: TFHE Cuda Backend - Fast tests on H100
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUSTFLAGS: "-C target-cpu=native"
|
||||
RUST_BACKTRACE: "full"
|
||||
RUST_MIN_STACK: "8388608"
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}
|
||||
|
||||
on:
|
||||
# Allows you to run this workflow manually from the Actions tab as an alternative.
|
||||
workflow_dispatch:
|
||||
pull_request:
|
||||
types: [ labeled ]
|
||||
|
||||
jobs:
|
||||
should-run:
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
pull-requests: write
|
||||
outputs:
|
||||
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Check for file changes
|
||||
id: changed-files
|
||||
uses: tj-actions/changed-files@48d8f15b2aaa3d255ca5af3eba4870f807ce6b3c
|
||||
with:
|
||||
since_last_remote_commit: true
|
||||
files_yaml: |
|
||||
gpu:
|
||||
- tfhe/Cargo.toml
|
||||
- tfhe/build.rs
|
||||
- backends/tfhe-cuda-backend/**
|
||||
- tfhe/src/core_crypto/gpu/**
|
||||
- tfhe/src/integer/gpu/**
|
||||
- tfhe/src/shortint/parameters/**
|
||||
- tfhe/src/high_level_api/**
|
||||
- tfhe/src/c_api/**
|
||||
- 'tfhe/docs/**.md'
|
||||
- Makefile
|
||||
- '.github/workflows/gpu_fast_h100_tests.yml'
|
||||
- scripts/**
|
||||
- ci/**
|
||||
|
||||
setup-instance:
|
||||
name: Setup instance (cuda-h100-tests)
|
||||
needs: should-run
|
||||
if: github.event_name != 'pull_request' ||
|
||||
(github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true') ||
|
||||
(github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
runner-name: ${{ steps.start-instance.outputs.label }}
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
backend: hyperstack
|
||||
profile: single-h100
|
||||
|
||||
cuda-tests-linux:
|
||||
name: CUDA H100 tests
|
||||
needs: [ should-run, setup-instance ]
|
||||
if: github.event_name != 'pull_request' ||
|
||||
(github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}_${{ github.ref }}
|
||||
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
|
||||
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
|
||||
strategy:
|
||||
fail-fast: false
|
||||
# explicit include-based build matrix, of known valid options
|
||||
matrix:
|
||||
include:
|
||||
- os: ubuntu-22.04
|
||||
cuda: "12.2"
|
||||
gcc: 11
|
||||
env:
|
||||
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
|
||||
CMAKE_VERSION: 3.29.6
|
||||
steps:
|
||||
# Mandatory on hyperstack since a bootable volume is not re-usable yet.
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
sudo apt update
|
||||
sudo apt install -y checkinstall zlib1g-dev libssl-dev
|
||||
wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
|
||||
tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
|
||||
cd cmake-${{ env.CMAKE_VERSION }}
|
||||
./bootstrap
|
||||
make -j"$(nproc)"
|
||||
sudo make install
|
||||
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
|
||||
with:
|
||||
persist-credentials: 'false'
|
||||
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Set up home
|
||||
run: |
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
- name: Export CUDA variables
|
||||
if: ${{ !cancelled() }}
|
||||
run: |
|
||||
echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
|
||||
echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
|
||||
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
|
||||
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
|
||||
|
||||
# Specify the correct host compilers
|
||||
- name: Export gcc and g++ variables
|
||||
if: ${{ !cancelled() }}
|
||||
run: |
|
||||
{
|
||||
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
|
||||
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
|
||||
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
|
||||
echo "HOME=/home/ubuntu";
|
||||
} >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Check device is detected
|
||||
if: ${{ !cancelled() }}
|
||||
run: nvidia-smi
|
||||
|
||||
- name: Run core crypto and internal CUDA backend tests
|
||||
run: |
|
||||
BIG_TESTS_INSTANCE=TRUE make test_core_crypto_gpu
|
||||
BIG_TESTS_INSTANCE=TRUE make test_integer_compression_gpu
|
||||
BIG_TESTS_INSTANCE=TRUE make test_cuda_backend
|
||||
|
||||
- name: Run user docs tests
|
||||
run: |
|
||||
BIG_TESTS_INSTANCE=TRUE make test_user_doc_gpu
|
||||
|
||||
- name: Test C API
|
||||
run: |
|
||||
BIG_TESTS_INSTANCE=TRUE make test_c_api_gpu
|
||||
|
||||
- name: Run High Level API Tests
|
||||
run: |
|
||||
BIG_TESTS_INSTANCE=TRUE make test_high_level_api_gpu
|
||||
|
||||
slack-notify:
|
||||
name: Slack Notification
|
||||
needs: [ setup-instance, cuda-tests-linux ]
|
||||
runs-on: ubuntu-latest
|
||||
if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' && failure() }}
|
||||
continue-on-error: true
|
||||
steps:
|
||||
- name: Send message
|
||||
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
|
||||
env:
|
||||
SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}
|
||||
SLACK_MESSAGE: "Fast H100 tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.ACTION_RUN_URL }})"
|
||||
|
||||
teardown-instance:
|
||||
name: Teardown instance (cuda-h100-tests)
|
||||
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
|
||||
needs: [ setup-instance, cuda-tests-linux ]
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
label: ${{ needs.setup-instance.outputs.runner-name }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "Instance teardown (cuda-h100-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
@@ -27,13 +27,13 @@ jobs:
|
||||
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Check for file changes
|
||||
id: changed-files
|
||||
uses: tj-actions/changed-files@cc733854b1f224978ef800d29e4709d5ee2883e4
|
||||
uses: tj-actions/changed-files@48d8f15b2aaa3d255ca5af3eba4870f807ce6b3c
|
||||
with:
|
||||
since_last_remote_commit: true
|
||||
files_yaml: |
|
||||
@@ -43,10 +43,14 @@ jobs:
|
||||
- backends/tfhe-cuda-backend/**
|
||||
- tfhe/src/core_crypto/gpu/**
|
||||
- tfhe/src/integer/gpu/**
|
||||
- tfhe/shortint/parameters/**
|
||||
- tfhe/src/shortint/parameters/**
|
||||
- tfhe/src/high_level_api/**
|
||||
- tfhe/src/c_api/**
|
||||
- 'tfhe/docs/**.md'
|
||||
- '.github/workflows/gpu_fast_tests.yml'
|
||||
- Makefile
|
||||
- scripts/**
|
||||
- ci/**
|
||||
|
||||
setup-instance:
|
||||
name: Setup instance (cuda-tests)
|
||||
@@ -59,13 +63,13 @@ jobs:
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
backend: aws
|
||||
backend: hyperstack
|
||||
profile: gpu-test
|
||||
|
||||
cuda-tests-linux:
|
||||
@@ -84,22 +88,35 @@ jobs:
|
||||
include:
|
||||
- os: ubuntu-22.04
|
||||
cuda: "12.2"
|
||||
gcc: 9
|
||||
gcc: 11
|
||||
env:
|
||||
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
|
||||
|
||||
CMAKE_VERSION: 3.29.6
|
||||
steps:
|
||||
# Mandatory on hyperstack since a bootable volume is not re-usable yet.
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
sudo apt update
|
||||
sudo apt install -y checkinstall zlib1g-dev libssl-dev
|
||||
wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
|
||||
tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
|
||||
cd cmake-${{ env.CMAKE_VERSION }}
|
||||
./bootstrap
|
||||
make -j"$(nproc)"
|
||||
sudo make install
|
||||
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
|
||||
with:
|
||||
persist-credentials: 'false'
|
||||
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Set up home
|
||||
run: |
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
@@ -122,9 +139,14 @@ jobs:
|
||||
echo "HOME=/home/ubuntu";
|
||||
} >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Check device is detected
|
||||
if: ${{ !cancelled() }}
|
||||
run: nvidia-smi
|
||||
|
||||
- name: Run core crypto and internal CUDA backend tests
|
||||
run: |
|
||||
make test_core_crypto_gpu
|
||||
make test_integer_compression_gpu
|
||||
make test_cuda_backend
|
||||
|
||||
- name: Run user docs tests
|
||||
@@ -139,13 +161,18 @@ jobs:
|
||||
run: |
|
||||
make test_high_level_api_gpu
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ always() }}
|
||||
continue-on-error: true
|
||||
slack-notify:
|
||||
name: Slack Notification
|
||||
needs: [ setup-instance, cuda-tests-linux ]
|
||||
runs-on: ubuntu-latest
|
||||
if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' && failure() }}
|
||||
continue-on-error: true
|
||||
steps:
|
||||
- name: Send message
|
||||
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "CUDA AWS tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}
|
||||
SLACK_MESSAGE: "Base GPU tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.ACTION_RUN_URL }})"
|
||||
|
||||
teardown-instance:
|
||||
name: Teardown instance (cuda-tests)
|
||||
@@ -155,7 +182,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
156
.github/workflows/gpu_full_h100_tests.yml
vendored
Normal file
156
.github/workflows/gpu_full_h100_tests.yml
vendored
Normal file
@@ -0,0 +1,156 @@
|
||||
# Compile and test tfhe-cuda-backend on an H100 VM on hyperstack
|
||||
name: TFHE Cuda Backend - Full tests on H100
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUSTFLAGS: "-C target-cpu=native"
|
||||
RUST_BACKTRACE: "full"
|
||||
RUST_MIN_STACK: "8388608"
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
|
||||
jobs:
|
||||
setup-instance:
|
||||
name: Setup instance (cuda-h100-tests)
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
runner-name: ${{ steps.start-instance.outputs.label }}
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
backend: hyperstack
|
||||
profile: single-h100
|
||||
|
||||
cuda-tests-linux:
|
||||
name: CUDA H100 tests
|
||||
needs: [ setup-instance ]
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}_${{ github.ref }}
|
||||
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
|
||||
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
|
||||
strategy:
|
||||
fail-fast: false
|
||||
# explicit include-based build matrix, of known valid options
|
||||
matrix:
|
||||
include:
|
||||
- os: ubuntu-22.04
|
||||
cuda: "12.2"
|
||||
gcc: 11
|
||||
env:
|
||||
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
|
||||
CMAKE_VERSION: 3.29.6
|
||||
steps:
|
||||
# Mandatory on hyperstack since a bootable volume is not re-usable yet.
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
sudo apt update
|
||||
sudo apt install -y checkinstall zlib1g-dev libssl-dev
|
||||
wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
|
||||
tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
|
||||
cd cmake-${{ env.CMAKE_VERSION }}
|
||||
./bootstrap
|
||||
make -j"$(nproc)"
|
||||
sudo make install
|
||||
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
|
||||
with:
|
||||
persist-credentials: 'false'
|
||||
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Set up home
|
||||
run: |
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
- name: Export CUDA variables
|
||||
if: ${{ !cancelled() }}
|
||||
run: |
|
||||
echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
|
||||
echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
|
||||
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
|
||||
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
|
||||
|
||||
# Specify the correct host compilers
|
||||
- name: Export gcc and g++ variables
|
||||
if: ${{ !cancelled() }}
|
||||
run: |
|
||||
{
|
||||
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
|
||||
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
|
||||
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
|
||||
echo "HOME=/home/ubuntu";
|
||||
} >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Check device is detected
|
||||
if: ${{ !cancelled() }}
|
||||
run: nvidia-smi
|
||||
|
||||
- name: Run core crypto, integer and internal CUDA backend tests
|
||||
run: |
|
||||
make test_gpu
|
||||
|
||||
- name: Run user docs tests
|
||||
run: |
|
||||
make test_user_doc_gpu
|
||||
|
||||
- name: Test C API
|
||||
run: |
|
||||
make test_c_api_gpu
|
||||
|
||||
- name: Run High Level API Tests
|
||||
run: |
|
||||
make test_high_level_api_gpu
|
||||
|
||||
slack-notify:
|
||||
name: Slack Notification
|
||||
needs: [ setup-instance, cuda-tests-linux ]
|
||||
runs-on: ubuntu-latest
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
steps:
|
||||
- name: Send message
|
||||
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
|
||||
env:
|
||||
SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}
|
||||
SLACK_MESSAGE: "Full H100 tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.ACTION_RUN_URL }})"
|
||||
|
||||
teardown-instance:
|
||||
name: Teardown instance (cuda-h100-tests)
|
||||
needs: [ setup-instance, cuda-tests-linux ]
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
label: ${{ needs.setup-instance.outputs.runner-name }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "Instance teardown (cuda-h100-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
@@ -28,13 +28,13 @@ jobs:
|
||||
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Check for file changes
|
||||
id: changed-files
|
||||
uses: tj-actions/changed-files@cc733854b1f224978ef800d29e4709d5ee2883e4
|
||||
uses: tj-actions/changed-files@48d8f15b2aaa3d255ca5af3eba4870f807ce6b3c
|
||||
with:
|
||||
since_last_remote_commit: true
|
||||
files_yaml: |
|
||||
@@ -44,10 +44,14 @@ jobs:
|
||||
- backends/tfhe-cuda-backend/**
|
||||
- tfhe/src/core_crypto/gpu/**
|
||||
- tfhe/src/integer/gpu/**
|
||||
- tfhe/shortint/parameters/**
|
||||
- tfhe/src/shortint/parameters/**
|
||||
- tfhe/src/high_level_api/**
|
||||
- tfhe/src/c_api/**
|
||||
- 'tfhe/docs/**.md'
|
||||
- Makefile
|
||||
- '.github/workflows/**_multi_gpu_tests.yml'
|
||||
- scripts/**
|
||||
- ci/**
|
||||
|
||||
setup-instance:
|
||||
name: Setup instance (cuda-tests-multi-gpu)
|
||||
@@ -61,13 +65,13 @@ jobs:
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
backend: aws
|
||||
backend: hyperstack
|
||||
profile: multi-gpu-test
|
||||
|
||||
cuda-tests-linux:
|
||||
@@ -86,20 +90,35 @@ jobs:
|
||||
include:
|
||||
- os: ubuntu-22.04
|
||||
cuda: "12.2"
|
||||
gcc: 9
|
||||
gcc: 11
|
||||
env:
|
||||
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
|
||||
|
||||
CMAKE_VERSION: 3.29.6
|
||||
steps:
|
||||
# Mandatory on hyperstack since a bootable volume is not re-usable yet.
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
sudo apt update
|
||||
sudo apt install -y checkinstall zlib1g-dev libssl-dev
|
||||
wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
|
||||
tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
|
||||
cd cmake-${{ env.CMAKE_VERSION }}
|
||||
./bootstrap
|
||||
make -j"$(nproc)"
|
||||
sudo make install
|
||||
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
|
||||
with:
|
||||
persist-credentials: 'false'
|
||||
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Set up home
|
||||
run: |
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
@@ -122,30 +141,43 @@ jobs:
|
||||
echo "HOME=/home/ubuntu";
|
||||
} >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Check device is detected
|
||||
if: ${{ !cancelled() }}
|
||||
run: nvidia-smi
|
||||
|
||||
- name: Run multi-bit CUDA integer compression tests
|
||||
run: |
|
||||
BIG_TESTS_INSTANCE=TRUE make test_integer_compression_gpu
|
||||
|
||||
# No need to test core_crypto and classic PBS in integer since it's already tested on single GPU.
|
||||
- name: Run multi-bit CUDA integer tests
|
||||
run: |
|
||||
make test_integer_multi_bit_gpu_ci
|
||||
BIG_TESTS_INSTANCE=TRUE make test_integer_multi_bit_gpu_ci
|
||||
|
||||
- name: Run user docs tests
|
||||
run: |
|
||||
make test_user_doc_gpu
|
||||
BIG_TESTS_INSTANCE=TRUE make test_user_doc_gpu
|
||||
|
||||
- name: Test C API
|
||||
run: |
|
||||
make test_c_api_gpu
|
||||
BIG_TESTS_INSTANCE=TRUE make test_c_api_gpu
|
||||
|
||||
- name: Run High Level API Tests
|
||||
run: |
|
||||
make test_high_level_api_gpu
|
||||
BIG_TESTS_INSTANCE=TRUE make test_high_level_api_gpu
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ always() }}
|
||||
continue-on-error: true
|
||||
slack-notify:
|
||||
name: Slack Notification
|
||||
needs: [ setup-instance, cuda-tests-linux ]
|
||||
runs-on: ubuntu-latest
|
||||
if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' && failure() }}
|
||||
continue-on-error: true
|
||||
steps:
|
||||
- name: Send message
|
||||
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "CUDA AWS multi-GPU tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}
|
||||
SLACK_MESSAGE: "Multi-GPU tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.ACTION_RUN_URL }})"
|
||||
|
||||
teardown-instance:
|
||||
name: Teardown instance (cuda-tests-multi-gpu)
|
||||
@@ -155,7 +187,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -24,7 +24,7 @@ jobs:
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -53,16 +53,17 @@ jobs:
|
||||
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
|
||||
with:
|
||||
persist-credentials: 'false'
|
||||
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Set up home
|
||||
run: |
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
@@ -94,7 +95,7 @@ jobs:
|
||||
make pcc_gpu
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ always() }}
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
|
||||
env:
|
||||
@@ -109,7 +110,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -1,5 +1,5 @@
|
||||
# Compile and test tfhe-cuda-backend on an H100 VM on hyperstack
|
||||
name: TFHE Cuda Backend - Full tests on H100
|
||||
# Signed integer GPU tests on an H100 VM on hyperstack
|
||||
name: TFHE Cuda Backend - Signed integer tests on H100
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
@@ -28,13 +28,14 @@ jobs:
|
||||
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
|
||||
with:
|
||||
fetch-depth: 0
|
||||
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Check for file changes
|
||||
id: changed-files
|
||||
uses: tj-actions/changed-files@cc733854b1f224978ef800d29e4709d5ee2883e4
|
||||
uses: tj-actions/changed-files@48d8f15b2aaa3d255ca5af3eba4870f807ce6b3c
|
||||
with:
|
||||
since_last_remote_commit: true
|
||||
files_yaml: |
|
||||
@@ -44,10 +45,14 @@ jobs:
|
||||
- backends/tfhe-cuda-backend/**
|
||||
- tfhe/src/core_crypto/gpu/**
|
||||
- tfhe/src/integer/gpu/**
|
||||
- tfhe/shortint/parameters/**
|
||||
- tfhe/src/shortint/parameters/**
|
||||
- tfhe/src/high_level_api/**
|
||||
- tfhe/src/c_api/**
|
||||
- 'tfhe/docs/**.md'
|
||||
- Makefile
|
||||
- '.github/workflows/gpu_signed_integer_h100_tests.yml'
|
||||
- scripts/**
|
||||
- ci/**
|
||||
|
||||
setup-instance:
|
||||
name: Setup instance (cuda-h100-tests)
|
||||
@@ -61,7 +66,7 @@ jobs:
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -71,7 +76,7 @@ jobs:
|
||||
profile: single-h100
|
||||
|
||||
cuda-tests-linux:
|
||||
name: CUDA H100 tests
|
||||
name: CUDA H100 signed integer tests
|
||||
needs: [ should-run, setup-instance ]
|
||||
if: github.event_name != 'pull_request' ||
|
||||
(github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
|
||||
@@ -104,14 +109,14 @@ jobs:
|
||||
sudo make install
|
||||
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
|
||||
|
||||
- name: Set up home
|
||||
run: |
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
@@ -134,40 +139,23 @@ jobs:
|
||||
echo "HOME=/home/ubuntu";
|
||||
} >> "${GITHUB_ENV}"
|
||||
|
||||
- name:
|
||||
- name: Check device is detected
|
||||
if: ${{ !cancelled() }}
|
||||
run: nvidia-smi
|
||||
|
||||
- name: Run core crypto and internal CUDA backend tests
|
||||
- name: Run signed integer tests
|
||||
run: |
|
||||
make test_core_crypto_gpu
|
||||
make test_cuda_backend
|
||||
BIG_TESTS_INSTANCE=TRUE make test_signed_integer_gpu_ci
|
||||
|
||||
- name: Run integer tests
|
||||
- name: Run signed integer multi-bit tests
|
||||
run: |
|
||||
make test_integer_gpu_ci
|
||||
|
||||
- name: Run integer multi-bit tests
|
||||
run: |
|
||||
make test_integer_multi_bit_gpu_ci
|
||||
|
||||
- name: Run user docs tests
|
||||
run: |
|
||||
make test_user_doc_gpu
|
||||
|
||||
- name: Test C API
|
||||
run: |
|
||||
make test_c_api_gpu
|
||||
|
||||
- name: Run High Level API Tests
|
||||
run: |
|
||||
make test_high_level_api_gpu
|
||||
BIG_TESTS_INSTANCE=TRUE make test_signed_integer_multi_bit_gpu_ci
|
||||
|
||||
slack-notify:
|
||||
name: Slack Notification
|
||||
needs: [ setup-instance, cuda-tests-linux ]
|
||||
runs-on: ubuntu-latest
|
||||
if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' }}
|
||||
if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' && failure() }}
|
||||
continue-on-error: true
|
||||
steps:
|
||||
- name: Send message
|
||||
@@ -184,7 +172,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -36,13 +36,13 @@ jobs:
|
||||
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Check for file changes
|
||||
id: changed-files
|
||||
uses: tj-actions/changed-files@cc733854b1f224978ef800d29e4709d5ee2883e4
|
||||
uses: tj-actions/changed-files@48d8f15b2aaa3d255ca5af3eba4870f807ce6b3c
|
||||
with:
|
||||
since_last_remote_commit: true
|
||||
files_yaml: |
|
||||
@@ -52,30 +52,34 @@ jobs:
|
||||
- backends/tfhe-cuda-backend/**
|
||||
- tfhe/src/core_crypto/gpu/**
|
||||
- tfhe/src/integer/gpu/**
|
||||
- tfhe/shortint/parameters/**
|
||||
- tfhe/src/shortint/parameters/**
|
||||
- tfhe/src/high_level_api/**
|
||||
- tfhe/src/c_api/**
|
||||
- 'tfhe/docs/**.md'
|
||||
- '.github/workflows/gpu_signed_integer_tests.yml'
|
||||
- Makefile
|
||||
- scripts/**
|
||||
- ci/**
|
||||
|
||||
setup-instance:
|
||||
name: Setup instance (cuda-signed-integer-tests)
|
||||
runs-on: ubuntu-latest
|
||||
needs: should-run
|
||||
if: github.event_name != 'pull_request' ||
|
||||
(github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true') ||
|
||||
(github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
|
||||
if: (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
|
||||
github.event_name == 'workflow_dispatch' ||
|
||||
(github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true')
|
||||
outputs:
|
||||
runner-name: ${{ steps.start-instance.outputs.label }}
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
backend: aws
|
||||
backend: hyperstack
|
||||
profile: gpu-test
|
||||
|
||||
cuda-signed-integer-tests:
|
||||
@@ -94,20 +98,35 @@ jobs:
|
||||
include:
|
||||
- os: ubuntu-22.04
|
||||
cuda: "12.2"
|
||||
gcc: 9
|
||||
gcc: 11
|
||||
env:
|
||||
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
|
||||
|
||||
CMAKE_VERSION: 3.29.6
|
||||
steps:
|
||||
# Mandatory on hyperstack since a bootable volume is not re-usable yet.
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
sudo apt update
|
||||
sudo apt install -y checkinstall zlib1g-dev libssl-dev
|
||||
wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
|
||||
tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
|
||||
cd cmake-${{ env.CMAKE_VERSION }}
|
||||
./bootstrap
|
||||
make -j"$(nproc)"
|
||||
sudo make install
|
||||
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
|
||||
with:
|
||||
persist-credentials: 'false'
|
||||
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Set up home
|
||||
run: |
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
@@ -138,22 +157,26 @@ jobs:
|
||||
echo "NIGHTLY_TESTS=TRUE";
|
||||
} >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Run signed integer tests
|
||||
if: github.event_name != 'pull_request' || contains(github.event.label.name, 'approved')
|
||||
run: |
|
||||
make test_signed_integer_gpu_ci
|
||||
- name: Check device is detected
|
||||
if: ${{ !cancelled() }}
|
||||
run: nvidia-smi
|
||||
|
||||
- name: Run signed integer multi-bit tests
|
||||
run: |
|
||||
make test_signed_integer_multi_bit_gpu_ci
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ always() }}
|
||||
continue-on-error: true
|
||||
slack-notify:
|
||||
name: Slack Notification
|
||||
needs: [ setup-instance, cuda-signed-integer-tests ]
|
||||
runs-on: ubuntu-latest
|
||||
if: ${{ always() && needs.cuda-signed-integer-tests.result != 'skipped' && failure() }}
|
||||
continue-on-error: true
|
||||
steps:
|
||||
- name: Send message
|
||||
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "CUDA AWS signed integer tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
SLACK_COLOR: ${{ needs.cuda-signed-integer-tests.result }}
|
||||
SLACK_MESSAGE: "Signed integer GPU tests finished with status: ${{ needs.cuda-signed-integer-tests.result }}. (${{ env.ACTION_RUN_URL }})"
|
||||
|
||||
teardown-instance:
|
||||
name: Teardown instance (cuda-tests)
|
||||
@@ -163,7 +186,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
189
.github/workflows/gpu_unsigned_integer_h100_tests.yml
vendored
Normal file
189
.github/workflows/gpu_unsigned_integer_h100_tests.yml
vendored
Normal file
@@ -0,0 +1,189 @@
|
||||
# Test unsigned integers on an H100 VM on hyperstack
|
||||
name: TFHE Cuda Backend - Unsigned integer tests on H100
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUSTFLAGS: "-C target-cpu=native"
|
||||
RUST_BACKTRACE: "full"
|
||||
RUST_MIN_STACK: "8388608"
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}
|
||||
|
||||
on:
|
||||
# Allows you to run this workflow manually from the Actions tab as an alternative.
|
||||
workflow_dispatch:
|
||||
pull_request:
|
||||
types: [ labeled ]
|
||||
|
||||
jobs:
|
||||
should-run:
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
pull-requests: write
|
||||
outputs:
|
||||
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
|
||||
with:
|
||||
fetch-depth: 0
|
||||
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Check for file changes
|
||||
id: changed-files
|
||||
uses: tj-actions/changed-files@48d8f15b2aaa3d255ca5af3eba4870f807ce6b3c
|
||||
with:
|
||||
since_last_remote_commit: true
|
||||
files_yaml: |
|
||||
gpu:
|
||||
- tfhe/Cargo.toml
|
||||
- tfhe/build.rs
|
||||
- backends/tfhe-cuda-backend/**
|
||||
- tfhe/src/core_crypto/gpu/**
|
||||
- tfhe/src/integer/gpu/**
|
||||
- tfhe/src/shortint/parameters/**
|
||||
- tfhe/src/high_level_api/**
|
||||
- tfhe/src/c_api/**
|
||||
- 'tfhe/docs/**.md'
|
||||
- Makefile
|
||||
- '.github/workflows/gpu_unsigned_integer_h100_tests.yml'
|
||||
- scripts/**
|
||||
- ci/**
|
||||
|
||||
setup-instance:
|
||||
name: Setup instance (cuda-h100-tests)
|
||||
needs: should-run
|
||||
if: github.event_name != 'pull_request' ||
|
||||
(github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true') ||
|
||||
(github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
runner-name: ${{ steps.start-instance.outputs.label }}
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
backend: hyperstack
|
||||
profile: single-h100
|
||||
|
||||
cuda-tests-linux:
|
||||
name: CUDA H100 unsigned integer tests
|
||||
needs: [ should-run, setup-instance ]
|
||||
if: github.event_name != 'pull_request' ||
|
||||
(github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}_${{ github.ref }}
|
||||
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
|
||||
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
|
||||
strategy:
|
||||
fail-fast: false
|
||||
# explicit include-based build matrix, of known valid options
|
||||
matrix:
|
||||
include:
|
||||
- os: ubuntu-22.04
|
||||
cuda: "12.2"
|
||||
gcc: 11
|
||||
env:
|
||||
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
|
||||
CMAKE_VERSION: 3.29.6
|
||||
steps:
|
||||
# Mandatory on hyperstack since a bootable volume is not re-usable yet.
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
sudo apt update
|
||||
sudo apt install -y checkinstall zlib1g-dev libssl-dev
|
||||
wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
|
||||
tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
|
||||
cd cmake-${{ env.CMAKE_VERSION }}
|
||||
./bootstrap
|
||||
make -j"$(nproc)"
|
||||
sudo make install
|
||||
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
|
||||
|
||||
- name: Set up home
|
||||
run: |
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
- name: Export CUDA variables
|
||||
if: ${{ !cancelled() }}
|
||||
run: |
|
||||
echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
|
||||
echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
|
||||
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
|
||||
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
|
||||
|
||||
# Specify the correct host compilers
|
||||
- name: Export gcc and g++ variables
|
||||
if: ${{ !cancelled() }}
|
||||
run: |
|
||||
{
|
||||
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
|
||||
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
|
||||
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
|
||||
echo "HOME=/home/ubuntu";
|
||||
} >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Check device is detected
|
||||
if: ${{ !cancelled() }}
|
||||
run: nvidia-smi
|
||||
|
||||
- name: Run unsigned integer tests
|
||||
run: |
|
||||
BIG_TESTS_INSTANCE=TRUE make test_unsigned_integer_gpu_ci
|
||||
|
||||
- name: Run unsigned integer multi-bit tests
|
||||
run: |
|
||||
BIG_TESTS_INSTANCE=TRUE make test_unsigned_integer_multi_bit_gpu_ci
|
||||
|
||||
slack-notify:
|
||||
name: Slack Notification
|
||||
needs: [ setup-instance, cuda-tests-linux ]
|
||||
runs-on: ubuntu-latest
|
||||
if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' && failure() }}
|
||||
continue-on-error: true
|
||||
steps:
|
||||
- name: Send message
|
||||
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
|
||||
env:
|
||||
SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}
|
||||
SLACK_MESSAGE: "Unsigned integer GPU H100 tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.ACTION_RUN_URL }})"
|
||||
|
||||
teardown-instance:
|
||||
name: Teardown instance (cuda-h100-tests)
|
||||
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
|
||||
needs: [ setup-instance, cuda-tests-linux ]
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
label: ${{ needs.setup-instance.outputs.runner-name }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "Instance teardown (cuda-h100-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
@@ -35,13 +35,14 @@ jobs:
|
||||
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
|
||||
with:
|
||||
fetch-depth: 0
|
||||
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Check for file changes
|
||||
id: changed-files
|
||||
uses: tj-actions/changed-files@cc733854b1f224978ef800d29e4709d5ee2883e4
|
||||
uses: tj-actions/changed-files@48d8f15b2aaa3d255ca5af3eba4870f807ce6b3c
|
||||
with:
|
||||
since_last_remote_commit: true
|
||||
files_yaml: |
|
||||
@@ -51,30 +52,34 @@ jobs:
|
||||
- backends/tfhe-cuda-backend/**
|
||||
- tfhe/src/core_crypto/gpu/**
|
||||
- tfhe/src/integer/gpu/**
|
||||
- tfhe/shortint/parameters/**
|
||||
- tfhe/src/shortint/parameters/**
|
||||
- tfhe/src/high_level_api/**
|
||||
- tfhe/src/c_api/**
|
||||
- 'tfhe/docs/**.md'
|
||||
- '.github/workflows/gpu_unsigned_integer_tests.yml'
|
||||
- Makefile
|
||||
- scripts/**
|
||||
- ci/**
|
||||
|
||||
setup-instance:
|
||||
name: Setup instance (cuda-unsigned-integer-tests)
|
||||
needs: should-run
|
||||
if: github.event_name != 'pull_request' ||
|
||||
(github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true') ||
|
||||
(github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
|
||||
if: (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
|
||||
github.event_name == 'workflow_dispatch' ||
|
||||
(github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true')
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
runner-name: ${{ steps.start-instance.outputs.label }}
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
backend: aws
|
||||
backend: hyperstack
|
||||
profile: gpu-test
|
||||
|
||||
cuda-unsigned-integer-tests:
|
||||
@@ -93,20 +98,32 @@ jobs:
|
||||
include:
|
||||
- os: ubuntu-22.04
|
||||
cuda: "12.2"
|
||||
gcc: 9
|
||||
gcc: 11
|
||||
env:
|
||||
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
|
||||
|
||||
CMAKE_VERSION: 3.29.6
|
||||
steps:
|
||||
# Mandatory on hyperstack since a bootable volume is not re-usable yet.
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
sudo apt update
|
||||
sudo apt install -y checkinstall zlib1g-dev libssl-dev
|
||||
wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
|
||||
tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
|
||||
cd cmake-${{ env.CMAKE_VERSION }}
|
||||
./bootstrap
|
||||
make -j"$(nproc)"
|
||||
sudo make install
|
||||
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
|
||||
|
||||
- name: Set up home
|
||||
run: |
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
@@ -137,22 +154,26 @@ jobs:
|
||||
echo "NIGHTLY_TESTS=TRUE";
|
||||
} >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Run unsigned integer tests
|
||||
if: github.event_name != 'pull_request' || contains(github.event.label.name, 'approved')
|
||||
run: |
|
||||
make test_unsigned_integer_gpu_ci
|
||||
- name: Check device is detected
|
||||
if: ${{ !cancelled() }}
|
||||
run: nvidia-smi
|
||||
|
||||
- name: Run unsigned integer multi-bit tests
|
||||
run: |
|
||||
make test_unsigned_integer_multi_bit_gpu_ci
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ always() }}
|
||||
continue-on-error: true
|
||||
slack-notify:
|
||||
name: Slack Notification
|
||||
needs: [ setup-instance, cuda-unsigned-integer-tests ]
|
||||
runs-on: ubuntu-latest
|
||||
if: ${{ always() && needs.cuda-unsigned-integer-tests.result != 'skipped' && failure() }}
|
||||
continue-on-error: true
|
||||
steps:
|
||||
- name: Send message
|
||||
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "CUDA AWS unsigned integer tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
SLACK_COLOR: ${{ needs.cuda-unsigned-integer-tests.result }}
|
||||
SLACK_MESSAGE: "Unsigned integer GPU tests finished with status: ${{ needs.cuda-unsigned-integer-tests.result }}. (${{ env.ACTION_RUN_URL }})"
|
||||
|
||||
teardown-instance:
|
||||
name: Teardown instance (cuda-tests)
|
||||
@@ -162,7 +183,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
4
.github/workflows/m1_tests.yml
vendored
4
.github/workflows/m1_tests.yml
vendored
@@ -34,12 +34,12 @@ jobs:
|
||||
timeout-minutes: 720
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
- uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
|
||||
with:
|
||||
persist-credentials: 'false'
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
|
||||
8
.github/workflows/make_release.yml
vendored
8
.github/workflows/make_release.yml
vendored
@@ -36,13 +36,13 @@ jobs:
|
||||
hash: ${{ steps.hash.outputs.hash }}
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
|
||||
with:
|
||||
fetch-depth: 0
|
||||
- name: Prepare package
|
||||
run: |
|
||||
cargo package -p tfhe
|
||||
- uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808 # v4.3.3
|
||||
- uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0
|
||||
with:
|
||||
name: crate
|
||||
path: target/package/*.crate
|
||||
@@ -74,7 +74,7 @@ jobs:
|
||||
id-token: write
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
|
||||
with:
|
||||
fetch-depth: 0
|
||||
- name: Create NPM version tag
|
||||
@@ -82,7 +82,7 @@ jobs:
|
||||
run: |
|
||||
echo "NPM_TAG=latest" >> "${GITHUB_ENV}"
|
||||
- name: Download artifact
|
||||
uses: actions/download-artifact@65a9edc5881444af0b9093a5e628f2fe47ea3b2e # v4.1.7
|
||||
uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8
|
||||
with:
|
||||
name: crate
|
||||
path: target/package
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
# Publish new release of tfhe-rs on various platform.
|
||||
name: Publish concrete-csprng release
|
||||
|
||||
on:
|
||||
@@ -18,7 +17,7 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
@@ -37,6 +36,6 @@ jobs:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_MESSAGE: "concrete-csprng release failed: (${{ env.ACTION_RUN_URL }})"
|
||||
SLACK_MESSAGE: "concrete-csprng release finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
|
||||
36
.github/workflows/make_release_concrete_tfhe_versionable.yml
vendored
Normal file
36
.github/workflows/make_release_concrete_tfhe_versionable.yml
vendored
Normal file
@@ -0,0 +1,36 @@
|
||||
name: Publish tfhe-versionable release
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
|
||||
env:
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
|
||||
jobs:
|
||||
publish_release:
|
||||
name: Publish tfhe-versionable Release
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Publish crate.io package
|
||||
env:
|
||||
CRATES_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
|
||||
run: |
|
||||
cargo publish -p tfhe-versionable-derive --token ${{ env.CRATES_TOKEN }}
|
||||
cargo publish -p tfhe-versionable --token ${{ env.CRATES_TOKEN }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_MESSAGE: "tfhe-versionable release finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
10
.github/workflows/make_release_cuda.yml
vendored
10
.github/workflows/make_release_cuda.yml
vendored
@@ -29,14 +29,14 @@ jobs:
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
backend: aws
|
||||
profile: gpu-test
|
||||
profile: gpu-build
|
||||
|
||||
publish-cuda-release:
|
||||
name: Publish CUDA Release
|
||||
@@ -54,7 +54,7 @@ jobs:
|
||||
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
@@ -63,7 +63,7 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
@@ -112,7 +112,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
|
||||
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
2
.github/workflows/make_release_zk_pok.yml
vendored
2
.github/workflows/make_release_zk_pok.yml
vendored
@@ -18,7 +18,7 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
|
||||
8
.github/workflows/parameters_check.yml
vendored
8
.github/workflows/parameters_check.yml
vendored
@@ -14,17 +14,17 @@ on:
|
||||
|
||||
jobs:
|
||||
params-curves-security-check:
|
||||
runs-on: ubuntu-latest
|
||||
runs-on: large_ubuntu_16
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
|
||||
|
||||
- name: Checkout lattice-estimator
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
|
||||
with:
|
||||
repository: malb/lattice-estimator
|
||||
path: lattice_estimator
|
||||
ref: '53508253629d3b5d31a2ad110e85dc69391ccb95'
|
||||
ref: 'e80ec6bbbba212428b0e92d0467c18629cf9ed67'
|
||||
|
||||
- name: Install Sage
|
||||
run: |
|
||||
|
||||
2
.github/workflows/sync_on_push.yml
vendored
2
.github/workflows/sync_on_push.yml
vendored
@@ -13,7 +13,7 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout repo
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
|
||||
with:
|
||||
fetch-depth: 0
|
||||
- name: git-sync
|
||||
|
||||
2
.gitignore
vendored
2
.gitignore
vendored
@@ -26,6 +26,8 @@ backends/tfhe-cuda-backend/cuda/cmake-build-debug/
|
||||
|
||||
# WASM tests
|
||||
tfhe/web_wasm_parallel_tests/server.PID
|
||||
venv/
|
||||
web-test-runner/
|
||||
|
||||
# Dir used for backward compatibility test data
|
||||
tfhe/tfhe-backward-compat-data/
|
||||
|
||||
178
Makefile
178
Makefile
@@ -18,22 +18,15 @@ FAST_TESTS?=FALSE
|
||||
FAST_BENCH?=FALSE
|
||||
NIGHTLY_TESTS?=FALSE
|
||||
BENCH_OP_FLAVOR?=DEFAULT
|
||||
NODE_VERSION=22.4
|
||||
NODE_VERSION=22.6
|
||||
FORWARD_COMPAT?=OFF
|
||||
BACKWARD_COMPAT_DATA_URL=https://github.com/zama-ai/tfhe-backward-compat-data.git
|
||||
BACKWARD_COMPAT_DATA_BRANCH?=v0.1
|
||||
BACKWARD_COMPAT_DATA_BRANCH?=v0.2
|
||||
BACKWARD_COMPAT_DATA_PROJECT=tfhe-backward-compat-data
|
||||
BACKWARD_COMPAT_DATA_DIR=$(BACKWARD_COMPAT_DATA_PROJECT)
|
||||
# sed: -n, do not print input stream, -e means a script/expression
|
||||
# 1,/version/ indicates from the first line, to the line matching version at the start of the line
|
||||
# p indicates to print, so we keep only the start of the Cargo.toml until we hit the first version
|
||||
# entry which should be the version of tfhe
|
||||
TFHE_CURRENT_VERSION:=\
|
||||
$(shell sed -n -e '1,/^version/p' tfhe/Cargo.toml | \
|
||||
grep '^version[[:space:]]*=' | cut -d '=' -f 2 | xargs)
|
||||
# Cargo has a hard time distinguishing between our package from the workspace and a package that
|
||||
# could be a dependency, so we build an unambiguous spec here
|
||||
TFHE_SPEC:=tfhe@$(TFHE_CURRENT_VERSION)
|
||||
TFHE_SPEC:=tfhe
|
||||
WEB_RUNNER_DIR=web-test-runner
|
||||
WEB_SERVER_DIR=tfhe/web_wasm_parallel_tests
|
||||
# This is done to avoid forgetting it, we still precise the RUSTFLAGS in the commands to be able to
|
||||
# copy paste the command in the terminal and change them if required without forgetting the flags
|
||||
export RUSTFLAGS?=-C target-cpu=native
|
||||
@@ -155,6 +148,43 @@ install_tfhe_lints:
|
||||
(cd utils/cargo-tfhe-lints-inner && cargo install --path .) && \
|
||||
cd utils/cargo-tfhe-lints && cargo install --path .
|
||||
|
||||
.PHONY: install_typos_checker # Install typos checker
|
||||
install_typos_checker: install_rs_build_toolchain
|
||||
@typos --version > /dev/null 2>&1 || \
|
||||
cargo $(CARGO_RS_BUILD_TOOLCHAIN) install typos-cli || \
|
||||
( echo "Unable to install typos-cli, unknown error." && exit 1 )
|
||||
|
||||
.PHONY: setup_venv # Setup Python virtualenv for wasm tests
|
||||
setup_venv:
|
||||
python3 -m venv venv
|
||||
@source venv/bin/activate && \
|
||||
pip3 install -r ci/webdriver_requirements.txt
|
||||
|
||||
# This is an internal target, not meant to be called on its own.
|
||||
install_web_resource:
|
||||
wget -P $(dest) $(url)
|
||||
@cd $(dest) && \
|
||||
echo "$(checksum) $(filename)" > checksum && \
|
||||
sha256sum -c checksum && \
|
||||
rm checksum && \
|
||||
unzip $(filename)
|
||||
|
||||
install_chrome_browser: url = "https://storage.googleapis.com/chrome-for-testing-public/128.0.6613.137/linux64/chrome-linux64.zip"
|
||||
install_chrome_browser: checksum = "c5d7da679f3a353ae4e4420ab113de06d4bd459152f5b17558390c02d9520566"
|
||||
install_chrome_browser: dest = "$(WEB_RUNNER_DIR)/chrome"
|
||||
install_chrome_browser: filename = "chrome-linux64.zip"
|
||||
|
||||
.PHONY: install_chrome_browser # Install Chrome browser for Linux
|
||||
install_chrome_browser: install_web_resource
|
||||
|
||||
install_chrome_web_driver: url = "https://storage.googleapis.com/chrome-for-testing-public/128.0.6613.137/linux64/chromedriver-linux64.zip"
|
||||
install_chrome_web_driver: checksum = "f041092f403fb7455a6da2871070b6587c32814a3e3c2b0a794d3d4aa4739151"
|
||||
install_chrome_web_driver: dest = "$(WEB_RUNNER_DIR)/chrome"
|
||||
install_chrome_web_driver: filename = "chromedriver-linux64.zip"
|
||||
|
||||
.PHONY: install_chrome_web_driver # Install Chrome web driver for Linux
|
||||
install_chrome_web_driver: install_web_resource
|
||||
|
||||
.PHONY: check_linelint_installed # Check if linelint newline linter is installed
|
||||
check_linelint_installed:
|
||||
@printf "\n" | linelint - > /dev/null 2>&1 || \
|
||||
@@ -216,6 +246,10 @@ check_fmt_js: check_nvm_installed
|
||||
nvm use $(NODE_VERSION) && \
|
||||
$(MAKE) -C tfhe/web_wasm_parallel_tests check_fmt
|
||||
|
||||
.PHONY: check_typos # Check for typos in codebase
|
||||
check_typos: install_typos_checker
|
||||
@typos && echo "No typos found"
|
||||
|
||||
.PHONY: clippy_gpu # Run clippy lints on tfhe with "gpu" enabled
|
||||
clippy_gpu: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
|
||||
@@ -223,6 +257,13 @@ clippy_gpu: install_rs_check_toolchain
|
||||
--all-targets \
|
||||
-p $(TFHE_SPEC) -- --no-deps -D warnings
|
||||
|
||||
.PHONY: check_gpu # Run check on tfhe with "gpu" enabled
|
||||
check_gpu: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" check \
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,gpu \
|
||||
--all-targets \
|
||||
-p $(TFHE_SPEC)
|
||||
|
||||
.PHONY: fix_newline # Fix newline at end of file issues to be UNIX compliant
|
||||
fix_newline: check_linelint_installed
|
||||
linelint -a .
|
||||
@@ -261,12 +302,18 @@ clippy_shortint: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
|
||||
--features=$(TARGET_ARCH_FEATURE),shortint \
|
||||
-p $(TFHE_SPEC) -- --no-deps -D warnings
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
|
||||
--features=$(TARGET_ARCH_FEATURE),shortint,experimental \
|
||||
-p $(TFHE_SPEC) -- --no-deps -D warnings
|
||||
|
||||
.PHONY: clippy_integer # Run clippy lints enabling the integer features
|
||||
clippy_integer: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
|
||||
--features=$(TARGET_ARCH_FEATURE),integer \
|
||||
-p $(TFHE_SPEC) -- --no-deps -D warnings
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
|
||||
--features=$(TARGET_ARCH_FEATURE),integer,experimental \
|
||||
-p $(TFHE_SPEC) -- --no-deps -D warnings
|
||||
|
||||
.PHONY: clippy # Run clippy lints enabling the boolean, shortint, integer
|
||||
clippy: install_rs_check_toolchain
|
||||
@@ -293,6 +340,9 @@ clippy_c_api: install_rs_check_toolchain
|
||||
|
||||
.PHONY: clippy_js_wasm_api # Run clippy lints enabling the boolean, shortint, integer and the js wasm API
|
||||
clippy_js_wasm_api: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
|
||||
--features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,high-level-client-js-wasm-api,zk-pok \
|
||||
-p $(TFHE_SPEC) -- --no-deps -D warnings
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
|
||||
--features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,high-level-client-js-wasm-api \
|
||||
-p $(TFHE_SPEC) -- --no-deps -D warnings
|
||||
@@ -312,6 +362,9 @@ clippy_all_targets: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,zk-pok \
|
||||
-p $(TFHE_SPEC) -- --no-deps -D warnings
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,zk-pok,experimental \
|
||||
-p $(TFHE_SPEC) -- --no-deps -D warnings
|
||||
|
||||
.PHONY: clippy_concrete_csprng # Run clippy lints on concrete-csprng
|
||||
clippy_concrete_csprng: install_rs_check_toolchain
|
||||
@@ -324,9 +377,17 @@ clippy_zk_pok: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
|
||||
-p tfhe-zk-pok -- --no-deps -D warnings
|
||||
|
||||
.PHONY: clippy_versionable # Run clippy lints on tfhe-versionable
|
||||
clippy_versionable: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
|
||||
-p tfhe-versionable-derive -- --no-deps -D warnings
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
|
||||
-p tfhe-versionable -- --no-deps -D warnings
|
||||
|
||||
.PHONY: clippy_all # Run all clippy targets
|
||||
clippy_all: clippy_rustdoc clippy clippy_boolean clippy_shortint clippy_integer clippy_all_targets \
|
||||
clippy_c_api clippy_js_wasm_api clippy_tasks clippy_core clippy_concrete_csprng clippy_zk_pok clippy_trivium
|
||||
clippy_c_api clippy_js_wasm_api clippy_tasks clippy_core clippy_concrete_csprng clippy_zk_pok clippy_trivium \
|
||||
clippy_versionable
|
||||
|
||||
.PHONY: clippy_fast # Run main clippy targets
|
||||
clippy_fast: clippy_rustdoc clippy clippy_all_targets clippy_c_api clippy_js_wasm_api clippy_tasks \
|
||||
@@ -340,7 +401,7 @@ clippy_cuda_backend: install_rs_check_toolchain
|
||||
.PHONY: tfhe_lints # Run custom tfhe-rs lints
|
||||
tfhe_lints: install_tfhe_lints
|
||||
cd tfhe && RUSTFLAGS="$(RUSTFLAGS)" cargo tfhe-lints \
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer -- -D warnings
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,zk-pok -- -D warnings
|
||||
|
||||
.PHONY: build_core # Build core_crypto without experimental features
|
||||
build_core: install_rs_build_toolchain install_rs_check_toolchain
|
||||
@@ -428,6 +489,7 @@ build_web_js_api_parallel: install_rs_check_toolchain install_wasm_pack
|
||||
-- --features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,parallel-wasm-api,zk-pok \
|
||||
-Z build-std=panic_abort,std && \
|
||||
find pkg/snippets -type f -iname workerHelpers.worker.js -exec sed -i "s|from '..\/..\/..\/';|from '..\/..\/..\/tfhe.js';|" {} \;
|
||||
jq '.files += ["snippets"]' tfhe/pkg/package.json > tmp_pkg.json && mv -f tmp_pkg.json tfhe/pkg/package.json
|
||||
|
||||
.PHONY: build_node_js_api # Build the js API targeting nodejs
|
||||
build_node_js_api: install_rs_build_toolchain install_wasm_pack
|
||||
@@ -490,6 +552,13 @@ test_integer_gpu: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
|
||||
--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- integer::gpu::server_key::
|
||||
|
||||
.PHONY: test_integer_compression_gpu
|
||||
test_integer_compression_gpu: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
|
||||
--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- integer::gpu::ciphertext::compressed_ciphertext_list::tests::
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
|
||||
--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- integer::gpu::ciphertext::compress
|
||||
|
||||
.PHONY: test_integer_gpu_ci # Run the tests for integer ci on gpu backend
|
||||
test_integer_gpu_ci: install_rs_check_toolchain install_cargo_nextest
|
||||
BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
|
||||
@@ -743,7 +812,7 @@ test_zk_pok: install_rs_build_toolchain
|
||||
.PHONY: test_versionable # Run tests for tfhe-versionable subcrate
|
||||
test_versionable: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
|
||||
-p tfhe-versionable
|
||||
--all-targets -p tfhe-versionable
|
||||
|
||||
# The backward compat data repo holds historical binary data but also rust code to generate and load them.
|
||||
# Here we use the "patch" functionality of Cargo to make sure the repo used for the data is the same as the one used for the code.
|
||||
@@ -751,11 +820,15 @@ test_versionable: install_rs_build_toolchain
|
||||
test_backward_compatibility_ci: install_rs_build_toolchain
|
||||
TFHE_BACKWARD_COMPAT_DATA_DIR="$(BACKWARD_COMPAT_DATA_DIR)" RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
|
||||
--config "patch.'$(BACKWARD_COMPAT_DATA_URL)'.$(BACKWARD_COMPAT_DATA_PROJECT).path=\"tfhe/$(BACKWARD_COMPAT_DATA_DIR)\"" \
|
||||
--features=$(TARGET_ARCH_FEATURE),shortint,integer -p $(TFHE_SPEC) test_backward_compatibility -- --nocapture
|
||||
--features=$(TARGET_ARCH_FEATURE),shortint,integer,zk-pok -p $(TFHE_SPEC) test_backward_compatibility -- --nocapture
|
||||
|
||||
.PHONY: test_backward_compatibility # Same as test_backward_compatibility_ci but tries to clone the data repo first if needed
|
||||
test_backward_compatibility: tfhe/$(BACKWARD_COMPAT_DATA_DIR) test_backward_compatibility_ci
|
||||
|
||||
.PHONY: backward_compat_branch # Prints the required backward compatibility branch
|
||||
backward_compat_branch:
|
||||
@echo "$(BACKWARD_COMPAT_DATA_BRANCH)"
|
||||
|
||||
.PHONY: doc # Build rust doc
|
||||
doc: install_rs_check_toolchain
|
||||
@# Even though we are not in docs.rs, this allows to "just" build the doc
|
||||
@@ -838,18 +911,35 @@ test_nodejs_wasm_api_in_docker: build_nodejs_test_docker
|
||||
|
||||
.PHONY: test_nodejs_wasm_api # Run tests for the nodejs on wasm API
|
||||
test_nodejs_wasm_api: build_node_js_api
|
||||
cd tfhe/js_on_wasm_tests && npm run test
|
||||
cd tfhe/js_on_wasm_tests && npm install && npm run test
|
||||
|
||||
.PHONY: test_web_js_api_parallel # Run tests for the web wasm api
|
||||
test_web_js_api_parallel: build_web_js_api_parallel
|
||||
$(MAKE) -C tfhe/web_wasm_parallel_tests test
|
||||
|
||||
.PHONY: test_web_js_api_parallel_ci # Run tests for the web wasm api
|
||||
test_web_js_api_parallel_ci: build_web_js_api_parallel
|
||||
# This is an internal target, not meant to be called on its own.
|
||||
run_web_js_api_parallel: build_web_js_api_parallel setup_venv
|
||||
cd $(WEB_SERVER_DIR) && npm install && npm run build
|
||||
source venv/bin/activate && \
|
||||
python ci/webdriver.py \
|
||||
--browser-path $(browser_path) \
|
||||
--driver-path $(driver_path) \
|
||||
--browser-kind $(browser_kind) \
|
||||
--server-cmd "npm run server" \
|
||||
--server-workdir "$(WEB_SERVER_DIR)" \
|
||||
--id-pattern $(filter)
|
||||
|
||||
test_web_js_api_parallel_chrome: browser_path = "$(WEB_RUNNER_DIR)/chrome/chrome-linux64/chrome"
|
||||
test_web_js_api_parallel_chrome: driver_path = "$(WEB_RUNNER_DIR)/chrome/chromedriver-linux64/chromedriver"
|
||||
test_web_js_api_parallel_chrome: browser_kind = chrome
|
||||
test_web_js_api_parallel_chrome: filter = Test
|
||||
|
||||
.PHONY: test_web_js_api_parallel_chrome # Run tests for the web wasm api
|
||||
test_web_js_api_parallel_chrome: run_web_js_api_parallel
|
||||
|
||||
.PHONY: test_web_js_api_parallel_chrome_ci # Run tests for the web wasm api
|
||||
test_web_js_api_parallel_chrome_ci: setup_venv
|
||||
source ~/.nvm/nvm.sh && \
|
||||
nvm install $(NODE_VERSION) && \
|
||||
nvm use $(NODE_VERSION) && \
|
||||
$(MAKE) -C tfhe/web_wasm_parallel_tests test-ci
|
||||
$(MAKE) test_web_js_api_parallel_chrome
|
||||
|
||||
.PHONY: no_tfhe_typo # Check we did not invert the h and f in tfhe
|
||||
no_tfhe_typo:
|
||||
@@ -867,6 +957,11 @@ dieharder_csprng: install_dieharder build_concrete_csprng
|
||||
# Benchmarks
|
||||
#
|
||||
|
||||
.PHONY: print_doc_bench_parameters # Print parameters used in doc benchmarks
|
||||
print_doc_bench_parameters:
|
||||
RUSTFLAGS="" cargo run --example print_doc_bench_parameters \
|
||||
--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache -p tfhe
|
||||
|
||||
.PHONY: bench_integer # Run benchmarks for unsigned integer
|
||||
bench_integer: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
|
||||
@@ -888,6 +983,18 @@ bench_integer_gpu: install_rs_check_toolchain
|
||||
--bench integer-bench \
|
||||
--features=$(TARGET_ARCH_FEATURE),integer,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
|
||||
|
||||
.PHONY: bench_integer_compression # Run benchmarks for unsigned integer compression
|
||||
bench_integer_compression: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench glwe_packing_compression-integer-bench \
|
||||
--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
|
||||
|
||||
.PHONY: bench_integer_compression_gpu
|
||||
bench_integer_compression_gpu: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench glwe_packing_compression-integer-bench \
|
||||
--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,gpu -p $(TFHE_SPEC) --
|
||||
|
||||
.PHONY: bench_integer_multi_bit # Run benchmarks for unsigned integer using multi-bit parameters
|
||||
bench_integer_multi_bit: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=MULTI_BIT \
|
||||
@@ -970,7 +1077,7 @@ bench_pbs128: install_rs_check_toolchain
|
||||
|
||||
.PHONY: bench_pbs_gpu # Run benchmarks for PBS on GPU backend
|
||||
bench_pbs_gpu: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_FAST_BENCH=$(FAST_BENCH) cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench pbs-bench \
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
|
||||
|
||||
@@ -986,15 +1093,20 @@ bench_ks_gpu: install_rs_check_toolchain
|
||||
--bench ks-bench \
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
|
||||
|
||||
.PHONY: bench_web_js_api_parallel # Run benchmarks for the web wasm api
|
||||
bench_web_js_api_parallel: build_web_js_api_parallel
|
||||
$(MAKE) -C tfhe/web_wasm_parallel_tests bench
|
||||
bench_web_js_api_parallel_chrome: browser_path = "$(WEB_RUNNER_DIR)/chrome/chrome-linux64/chrome"
|
||||
bench_web_js_api_parallel_chrome: driver_path = "$(WEB_RUNNER_DIR)/chrome/chromedriver-linux64/chromedriver"
|
||||
bench_web_js_api_parallel_chrome: browser_kind = chrome
|
||||
bench_web_js_api_parallel_chrome: filter = Bench
|
||||
|
||||
.PHONY: bench_web_js_api_parallel_ci # Run benchmarks for the web wasm api
|
||||
bench_web_js_api_parallel_ci: build_web_js_api_parallel
|
||||
.PHONY: bench_web_js_api_parallel_chrome # Run benchmarks for the web wasm api
|
||||
bench_web_js_api_parallel_chrome: run_web_js_api_parallel
|
||||
|
||||
.PHONY: bench_web_js_api_parallel_chrome_ci # Run benchmarks for the web wasm api
|
||||
bench_web_js_api_parallel_chrome_ci: setup_venv
|
||||
source ~/.nvm/nvm.sh && \
|
||||
nvm install $(NODE_VERSION) && \
|
||||
nvm use $(NODE_VERSION) && \
|
||||
$(MAKE) -C tfhe/web_wasm_parallel_tests bench-ci
|
||||
$(MAKE) bench_web_js_api_parallel_chrome
|
||||
|
||||
#
|
||||
# Utility tools
|
||||
@@ -1042,7 +1154,7 @@ parse_wasm_benchmarks: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) run --profile $(CARGO_PROFILE) \
|
||||
--example wasm_benchmarks_parser \
|
||||
--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache \
|
||||
-- web_wasm_parallel_tests/test/benchmark_results
|
||||
-- wasm_benchmark_results.json
|
||||
|
||||
.PHONY: write_params_to_file # Gather all crypto parameters into a file with a Sage readable format.
|
||||
write_params_to_file: install_rs_check_toolchain
|
||||
@@ -1081,14 +1193,14 @@ sha256_bool: install_rs_check_toolchain
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean
|
||||
|
||||
.PHONY: pcc # pcc stands for pre commit checks (except GPU)
|
||||
pcc: no_tfhe_typo no_dbg_log check_fmt lint_doc check_md_docs_are_tested check_intra_md_links \
|
||||
pcc: no_tfhe_typo no_dbg_log check_fmt check_typos lint_doc check_md_docs_are_tested check_intra_md_links \
|
||||
clippy_all tfhe_lints check_compile_tests
|
||||
|
||||
.PHONY: pcc_gpu # pcc stands for pre commit checks for GPU compilation
|
||||
pcc_gpu: clippy_gpu clippy_cuda_backend check_compile_tests_benches_gpu
|
||||
|
||||
.PHONY: fpcc # pcc stands for pre commit checks, the f stands for fast
|
||||
fpcc: no_tfhe_typo no_dbg_log check_fmt lint_doc check_md_docs_are_tested clippy_fast \
|
||||
fpcc: no_tfhe_typo no_dbg_log check_fmt check_typos lint_doc check_md_docs_are_tested clippy_fast \
|
||||
check_compile_tests
|
||||
|
||||
.PHONY: conformance # Automatically fix problems that can be fixed
|
||||
|
||||
@@ -159,7 +159,7 @@ To run this code, use the following command:
|
||||
> Note that when running code that uses `TFHE-rs`, it is highly recommended
|
||||
to run in release mode with cargo's `--release` flag to get the best possible performance.
|
||||
|
||||
*Find an example with more explanations in [this part of the documentation](https://docs.zama.ai/tfhe-rs/getting-started/quick_start)*
|
||||
*Find an example with more explanations in [this part of the documentation](https://docs.zama.ai/tfhe-rs/get-started/quick_start)*
|
||||
|
||||
<p align="right">
|
||||
<a href="#about" > ↑ Back to top </a>
|
||||
|
||||
15
_typos.toml
Normal file
15
_typos.toml
Normal file
@@ -0,0 +1,15 @@
|
||||
[default]
|
||||
extend-ignore-identifiers-re = [
|
||||
# Related to serialized object
|
||||
"ser",
|
||||
"unser",
|
||||
# Used when dumping tfhe-rs parameters set into Sage format
|
||||
"ND.*",
|
||||
# Related to FHE strings example handling "banana"
|
||||
"ba",
|
||||
"enc_ba",
|
||||
# Example with string replacing "hello" with "herlo"
|
||||
"herlo",
|
||||
# Example in trivium
|
||||
"C9217BA0D762ACA1"
|
||||
]
|
||||
@@ -4,9 +4,8 @@ use tfhe::{generate_keys, ConfigBuilder, FheUint64, FheUint8};
|
||||
use tfhe_trivium::{KreyviumStreamByte, TransCiphering};
|
||||
|
||||
pub fn kreyvium_byte_gen(c: &mut Criterion) {
|
||||
let config = ConfigBuilder::default()
|
||||
.enable_function_evaluation()
|
||||
.build();
|
||||
let config = ConfigBuilder::default().build();
|
||||
|
||||
let (client_key, server_key) = generate_keys(config);
|
||||
|
||||
let key_string = "0053A6F94C9FF24598EB000000000000".to_string();
|
||||
@@ -33,9 +32,8 @@ pub fn kreyvium_byte_gen(c: &mut Criterion) {
|
||||
}
|
||||
|
||||
pub fn kreyvium_byte_trans(c: &mut Criterion) {
|
||||
let config = ConfigBuilder::default()
|
||||
.enable_function_evaluation()
|
||||
.build();
|
||||
let config = ConfigBuilder::default().build();
|
||||
|
||||
let (client_key, server_key) = generate_keys(config);
|
||||
|
||||
let key_string = "0053A6F94C9FF24598EB000000000000".to_string();
|
||||
@@ -63,9 +61,8 @@ pub fn kreyvium_byte_trans(c: &mut Criterion) {
|
||||
}
|
||||
|
||||
pub fn kreyvium_byte_warmup(c: &mut Criterion) {
|
||||
let config = ConfigBuilder::default()
|
||||
.enable_function_evaluation()
|
||||
.build();
|
||||
let config = ConfigBuilder::default().build();
|
||||
|
||||
let (client_key, server_key) = generate_keys(config);
|
||||
|
||||
let key_string = "0053A6F94C9FF24598EB000000000000".to_string();
|
||||
|
||||
@@ -148,10 +148,9 @@ where
|
||||
|
||||
/// Computes one turn of the stream, updating registers and outputting the new bit.
|
||||
pub fn next_bool(&mut self) -> T {
|
||||
match &self.fhe_key {
|
||||
Some(sk) => set_server_key(sk.clone()),
|
||||
None => (),
|
||||
};
|
||||
if let Some(sk) = &self.fhe_key {
|
||||
set_server_key(sk.clone());
|
||||
}
|
||||
|
||||
let [o, a, b, c] = self.get_output_and_values(0);
|
||||
|
||||
@@ -226,18 +225,12 @@ where
|
||||
/// Computes 64 turns of the stream, outputting the 64 bits all at once in a
|
||||
/// Vec (first value is oldest, last is newest)
|
||||
pub fn next_64(&mut self) -> Vec<T> {
|
||||
match &self.fhe_key {
|
||||
Some(sk) => {
|
||||
rayon::broadcast(|_| set_server_key(sk.clone()));
|
||||
}
|
||||
None => (),
|
||||
if let Some(sk) = &self.fhe_key {
|
||||
rayon::broadcast(|_| set_server_key(sk.clone()));
|
||||
}
|
||||
let mut values = self.get_64_output_and_values();
|
||||
match &self.fhe_key {
|
||||
Some(_) => {
|
||||
rayon::broadcast(|_| unset_server_key());
|
||||
}
|
||||
None => (),
|
||||
if self.fhe_key.is_some() {
|
||||
rayon::broadcast(|_| unset_server_key());
|
||||
}
|
||||
|
||||
let mut ret = Vec::<T>::with_capacity(64);
|
||||
|
||||
@@ -119,7 +119,7 @@ impl KreyviumStreamByte<FheUint8> {
|
||||
}
|
||||
|
||||
// Key and iv are stored in reverse in their shift registers
|
||||
let mut key = key_bytes.map(|b| b.map(|x| (x as u8).reverse_bits() as u64));
|
||||
let mut key = key_bytes.map(|b| b.reverse_bits());
|
||||
let mut iv = iv_bytes.map(|x| FheUint8::encrypt_trivial(x.reverse_bits()));
|
||||
key.reverse();
|
||||
iv.reverse();
|
||||
@@ -237,18 +237,12 @@ where
|
||||
/// Computes 64 turns of the stream, outputting the 64 bits (in 8 bytes) all at once in a
|
||||
/// Vec (first value is oldest, last is newest)
|
||||
pub fn next_64(&mut self) -> Vec<T> {
|
||||
match &self.fhe_key {
|
||||
Some(sk) => {
|
||||
rayon::broadcast(|_| set_server_key(sk.clone()));
|
||||
}
|
||||
None => (),
|
||||
if let Some(sk) = &self.fhe_key {
|
||||
rayon::broadcast(|_| set_server_key(sk.clone()));
|
||||
}
|
||||
let values = self.get_64_output_and_values();
|
||||
match &self.fhe_key {
|
||||
Some(_) => {
|
||||
rayon::broadcast(|_| unset_server_key());
|
||||
}
|
||||
None => (),
|
||||
if self.fhe_key.is_some() {
|
||||
rayon::broadcast(|_| unset_server_key());
|
||||
}
|
||||
|
||||
let mut bytes = Vec::<T>::with_capacity(8);
|
||||
|
||||
@@ -299,9 +299,8 @@ fn kreyvium_test_clear_byte() {
|
||||
|
||||
#[test]
|
||||
fn kreyvium_test_byte_long() {
|
||||
let config = ConfigBuilder::default()
|
||||
.enable_function_evaluation()
|
||||
.build();
|
||||
let config = ConfigBuilder::default().build();
|
||||
|
||||
let (client_key, server_key) = generate_keys(config);
|
||||
|
||||
let key_string = "0053A6F94C9FF24598EB000000000000".to_string();
|
||||
@@ -338,9 +337,8 @@ fn kreyvium_test_byte_long() {
|
||||
|
||||
#[test]
|
||||
fn kreyvium_test_fhe_byte_transciphering_long() {
|
||||
let config = ConfigBuilder::default()
|
||||
.enable_function_evaluation()
|
||||
.build();
|
||||
let config = ConfigBuilder::default().build();
|
||||
|
||||
let (client_key, server_key) = generate_keys(config);
|
||||
|
||||
let key_string = "0053A6F94C9FF24598EB000000000000".to_string();
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
#![allow(clippy::too_long_first_doc_paragraph)]
|
||||
|
||||
mod static_deque;
|
||||
|
||||
mod kreyvium;
|
||||
|
||||
@@ -120,10 +120,9 @@ where
|
||||
|
||||
/// Computes one turn of the stream, updating registers and outputting the new bit.
|
||||
pub fn next_bool(&mut self) -> T {
|
||||
match &self.fhe_key {
|
||||
Some(sk) => set_server_key(sk.clone()),
|
||||
None => (),
|
||||
};
|
||||
if let Some(sk) = &self.fhe_key {
|
||||
set_server_key(sk.clone());
|
||||
}
|
||||
|
||||
let [o, a, b, c] = self.get_output_and_values(0);
|
||||
|
||||
@@ -196,18 +195,12 @@ where
|
||||
/// Computes 64 turns of the stream, outputting the 64 bits all at once in a
|
||||
/// Vec (first value is oldest, last is newest)
|
||||
pub fn next_64(&mut self) -> Vec<T> {
|
||||
match &self.fhe_key {
|
||||
Some(sk) => {
|
||||
rayon::broadcast(|_| set_server_key(sk.clone()));
|
||||
}
|
||||
None => (),
|
||||
if let Some(sk) = &self.fhe_key {
|
||||
rayon::broadcast(|_| set_server_key(sk.clone()));
|
||||
}
|
||||
let mut values = self.get_64_output_and_values();
|
||||
match &self.fhe_key {
|
||||
Some(_) => {
|
||||
rayon::broadcast(|_| unset_server_key());
|
||||
}
|
||||
None => (),
|
||||
if self.fhe_key.is_some() {
|
||||
rayon::broadcast(|_| unset_server_key());
|
||||
}
|
||||
|
||||
let mut ret = Vec::<T>::with_capacity(64);
|
||||
|
||||
@@ -187,18 +187,12 @@ where
|
||||
/// Computes 64 turns of the stream, outputting the 64 bits (in 8 bytes) all at once in a
|
||||
/// Vec (first value is oldest, last is newest)
|
||||
pub fn next_64(&mut self) -> Vec<T> {
|
||||
match &self.fhe_key {
|
||||
Some(sk) => {
|
||||
rayon::broadcast(|_| set_server_key(sk.clone()));
|
||||
}
|
||||
None => (),
|
||||
if let Some(sk) = &self.fhe_key {
|
||||
rayon::broadcast(|_| set_server_key(sk.clone()));
|
||||
}
|
||||
let values = self.get_64_output_and_values();
|
||||
match &self.fhe_key {
|
||||
Some(_) => {
|
||||
rayon::broadcast(|_| unset_server_key());
|
||||
}
|
||||
None => (),
|
||||
if self.fhe_key.is_some() {
|
||||
rayon::broadcast(|_| unset_server_key());
|
||||
}
|
||||
|
||||
let mut bytes = Vec::<T>::with_capacity(8);
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "tfhe-cuda-backend"
|
||||
version = "0.4.0-alpha.0"
|
||||
version = "0.4.0"
|
||||
edition = "2021"
|
||||
authors = ["Zama team"]
|
||||
license = "BSD-3-Clause-Clear"
|
||||
|
||||
@@ -67,9 +67,21 @@ endif()
|
||||
|
||||
add_compile_definitions(CUDA_ARCH=${CUDA_ARCH})
|
||||
|
||||
# Select the optimization flags from CMAKE_BUILD_TYPE ("Debug" vs. any other value)
|
||||
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
|
||||
# Debug mode
|
||||
message("Compiling in Debug mode")
|
||||
add_definitions(-DDEBUG)
|
||||
set(OPTIMIZATION_FLAGS "${OPTIMIZATION_FLAGS} -O0 -G -g")
|
||||
else()
|
||||
# Release mode
|
||||
message("Compiling in Release mode")
|
||||
set(OPTIMIZATION_FLAGS "${OPTIMIZATION_FLAGS} -O3")
|
||||
endif()
|
||||
|
||||
# in production, should use -arch=sm_70 --ptxas-options=-v to see register spills -lineinfo for better debugging
|
||||
set(CMAKE_CUDA_FLAGS
|
||||
"${CMAKE_CUDA_FLAGS} -ccbin ${CMAKE_CXX_COMPILER} -O3 \
|
||||
"${CMAKE_CUDA_FLAGS} -ccbin ${CMAKE_CXX_COMPILER} ${OPTIMIZATION_FLAGS}\
|
||||
-std=c++17 --no-exceptions --expt-relaxed-constexpr -rdc=true \
|
||||
--use_fast_math -Xcompiler -fPIC")
|
||||
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
#ifndef CUDA_CIPHERTEXT_H
|
||||
#define CUDA_CIPHERTEXT_H
|
||||
|
||||
#include "device.h"
|
||||
#include <cstdint>
|
||||
|
||||
extern "C" {
|
||||
@@ -14,5 +15,11 @@ void cuda_convert_lwe_ciphertext_vector_to_cpu_64(void *stream,
|
||||
void *dest, void *src,
|
||||
uint32_t number_of_cts,
|
||||
uint32_t lwe_dimension);
|
||||
|
||||
void cuda_glwe_sample_extract_64(void *stream, uint32_t gpu_index,
|
||||
void *lwe_array_out, void *glwe_array_in,
|
||||
uint32_t *nth_array, uint32_t num_nths,
|
||||
uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size);
|
||||
};
|
||||
#endif
|
||||
|
||||
163
backends/tfhe-cuda-backend/cuda/include/compression.h
Normal file
163
backends/tfhe-cuda-backend/cuda/include/compression.h
Normal file
@@ -0,0 +1,163 @@
|
||||
#ifndef CUDA_INTEGER_COMPRESSION_H
|
||||
#define CUDA_INTEGER_COMPRESSION_H
|
||||
|
||||
#include "integer.h"
|
||||
|
||||
// C-linkage entry points for compressing / decompressing radix ciphertexts.
// The declarations come in scratch_* / cuda_* / cleanup_* groups for each of
// the two operations.
// NOTE(review): presumably scratch_* builds the opaque state returned through
// `mem_ptr`, cuda_* consumes it, and cleanup_* frees it — confirm against the
// implementation file, which is not visible from this header.
extern "C" {
|
||||
// Scratch for compression. Takes the compression GLWE parameters, the
// keyswitch decomposition parameters and the packing parameters
// (`lwe_per_glwe`, `storage_log_modulus`).
void scratch_cuda_integer_compress_radix_ciphertext_64(
|
||||
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
|
||||
uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
|
||||
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
|
||||
PBS_TYPE pbs_type, uint32_t lwe_per_glwe, uint32_t storage_log_modulus,
|
||||
bool allocate_gpu_memory);
|
||||
|
||||
// Scratch for decompression. Takes both the encryption and the compression
// GLWE parameters, plus the PBS decomposition parameters and `body_count`.
void scratch_cuda_integer_decompress_radix_ciphertext_64(
|
||||
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
|
||||
uint32_t encryption_glwe_dimension, uint32_t encryption_polynomial_size,
|
||||
uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
|
||||
uint32_t lwe_dimension, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
|
||||
PBS_TYPE pbs_type, uint32_t storage_log_modulus, uint32_t body_count,
|
||||
bool allocate_gpu_memory);
|
||||
|
||||
// Compression entry point: reads `lwe_array_in`, writes `glwe_array_out`.
// NOTE(review): `fp_ksk` is presumably the packing keyswitch key array and
// `num_nths` the number of input LWEs — confirm against the caller.
void cuda_integer_compress_radix_ciphertext_64(
|
||||
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
|
||||
void *glwe_array_out, void *lwe_array_in, void **fp_ksk, uint32_t num_nths,
|
||||
int8_t *mem_ptr);
|
||||
|
||||
// Decompression entry point: reads `glwe_in`, writes `lwe_array_out` for the
// `indexes_array_size` entries listed in `indexes_array`.
// NOTE(review): `bsks` is presumably the bootstrapping key array — confirm.
void cuda_integer_decompress_radix_ciphertext_64(
|
||||
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
|
||||
void *lwe_array_out, void *glwe_in, uint32_t *indexes_array,
|
||||
uint32_t indexes_array_size, void **bsks, int8_t *mem_ptr);
|
||||
|
||||
// Cleanup counterpart of scratch_cuda_integer_compress_radix_ciphertext_64.
void cleanup_cuda_integer_compress_radix_ciphertext_64(void **streams,
|
||||
uint32_t *gpu_indexes,
|
||||
uint32_t gpu_count,
|
||||
int8_t **mem_ptr_void);
|
||||
|
||||
// Cleanup counterpart of scratch_cuda_integer_decompress_radix_ciphertext_64.
void cleanup_cuda_integer_decompress_radix_ciphertext_64(void **streams,
|
||||
uint32_t *gpu_indexes,
|
||||
uint32_t gpu_count,
|
||||
int8_t **mem_ptr_void);
|
||||
}
|
||||
|
||||
// Scratch state for radix-ciphertext compression: stores the compression
// parameters together with two temporary device buffers and the scratch
// buffer set up by scratch_packing_keyswitch_lwe_list_to_glwe_64.
// Every device allocation made here goes through streams[0] / gpu_indexes[0].
// NOTE(review): the buffer pointers are only assigned when
// `allocate_gpu_memory` is true, whereas release() frees them
// unconditionally — confirm release() is never called on an instance built
// with allocate_gpu_memory == false.
template <typename Torus> struct int_compression {
|
||||
int_radix_params compression_params;
|
||||
uint32_t storage_log_modulus;
|
||||
uint32_t lwe_per_glwe;
|
||||
|
||||
// Set to `num_radix_blocks` by the constructor
uint32_t body_count;
|
||||
|
||||
// Compression
|
||||
// Filled in by scratch_packing_keyswitch_lwe_list_to_glwe_64
int8_t *fp_ks_buffer;
|
||||
// num_radix_blocks * (small_lwe_dimension + 1) Torus elements
Torus *tmp_lwe;
|
||||
// lwe_per_glwe * (glwe_dimension + 1) * polynomial_size Torus elements
Torus *tmp_glwe_array_out;
|
||||
|
||||
// Records the parameters and, when `allocate_gpu_memory` is true, allocates
// the temporary buffers. `gpu_count` is not used by this constructor.
int_compression(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
uint32_t gpu_count, int_radix_params compression_params,
|
||||
uint32_t num_radix_blocks, uint32_t lwe_per_glwe,
|
||||
uint32_t storage_log_modulus, bool allocate_gpu_memory) {
|
||||
this->compression_params = compression_params;
|
||||
this->lwe_per_glwe = lwe_per_glwe;
|
||||
this->storage_log_modulus = storage_log_modulus;
|
||||
this->body_count = num_radix_blocks;
|
||||
|
||||
if (allocate_gpu_memory) {
|
||||
// Element count of one GLWE: (glwe_dimension + 1) * polynomial_size.
// It is an element count, even though it is held in a Torus-typed variable.
Torus glwe_accumulator_size = (compression_params.glwe_dimension + 1) *
|
||||
compression_params.polynomial_size;
|
||||
|
||||
tmp_lwe = (Torus *)cuda_malloc_async(
|
||||
num_radix_blocks * (compression_params.small_lwe_dimension + 1) *
|
||||
sizeof(Torus),
|
||||
streams[0], gpu_indexes[0]);
|
||||
tmp_glwe_array_out = (Torus *)cuda_malloc_async(
|
||||
lwe_per_glwe * glwe_accumulator_size * sizeof(Torus), streams[0],
|
||||
gpu_indexes[0]);
|
||||
|
||||
// The last argument is a literal `true`; this call is only reached when
// allocate_gpu_memory is true.
scratch_packing_keyswitch_lwe_list_to_glwe_64(
|
||||
streams[0], gpu_indexes[0], &fp_ks_buffer,
|
||||
compression_params.glwe_dimension, compression_params.polynomial_size,
|
||||
num_radix_blocks, true);
|
||||
}
|
||||
}
|
||||
// Frees both temporary buffers and the packing keyswitch scratch buffer on
// streams[0] / gpu_indexes[0]. `gpu_count` is not used.
void release(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
uint32_t gpu_count) {
|
||||
cuda_drop_async(tmp_lwe, streams[0], gpu_indexes[0]);
|
||||
cuda_drop_async(tmp_glwe_array_out, streams[0], gpu_indexes[0]);
|
||||
cleanup_packing_keyswitch_lwe_list_to_glwe(streams[0], gpu_indexes[0],
|
||||
&fp_ks_buffer);
|
||||
}
|
||||
};
|
||||
|
||||
// Scratch state for radix-ciphertext decompression: stores the encryption and
// compression parameters, three temporary device buffers and a one-LUT
// int_radix_lut whose function is x / message_modulus.
// The temporary buffers are allocated on streams[0] / gpu_indexes[0].
// NOTE(review): `carry_extract_lut` and the buffers are only created when
// `allocate_gpu_memory` is true, whereas release() dereferences and frees
// them unconditionally — confirm release() is never called on an instance
// built with allocate_gpu_memory == false.
template <typename Torus> struct int_decompression {
|
||||
int_radix_params encryption_params;
|
||||
int_radix_params compression_params;
|
||||
|
||||
uint32_t storage_log_modulus;
|
||||
|
||||
uint32_t num_radix_blocks;
|
||||
uint32_t body_count;
|
||||
|
||||
// num_radix_blocks * (glwe_dimension + 1) * polynomial_size Torus elements
// (dimensions taken from compression_params)
Torus *tmp_extracted_glwe;
|
||||
// num_radix_blocks * (glwe_dimension * polynomial_size + 1) Torus elements
// (dimensions taken from compression_params)
Torus *tmp_extracted_lwe;
|
||||
// num_radix_blocks uint32_t entries
uint32_t *tmp_indexes_array;
|
||||
|
||||
// Heap-allocated in the constructor, deleted in release()
int_radix_lut<Torus> *carry_extract_lut;
|
||||
|
||||
// Records the parameters and, when `allocate_gpu_memory` is true, allocates
// the temporary buffers and builds the carry-extract LUT.
int_decompression(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
uint32_t gpu_count, int_radix_params encryption_params,
|
||||
int_radix_params compression_params,
|
||||
uint32_t num_radix_blocks, uint32_t body_count,
|
||||
uint32_t storage_log_modulus, bool allocate_gpu_memory) {
|
||||
this->encryption_params = encryption_params;
|
||||
this->compression_params = compression_params;
|
||||
this->storage_log_modulus = storage_log_modulus;
|
||||
this->num_radix_blocks = num_radix_blocks;
|
||||
this->body_count = body_count;
|
||||
|
||||
if (allocate_gpu_memory) {
|
||||
// Both sizes are element counts derived from the compression parameters,
// even though they are held in Torus-typed variables.
Torus glwe_accumulator_size = (compression_params.glwe_dimension + 1) *
|
||||
compression_params.polynomial_size;
|
||||
Torus lwe_accumulator_size = (compression_params.glwe_dimension *
|
||||
compression_params.polynomial_size +
|
||||
1);
|
||||
// The LUT object is built from the encryption parameters; the literal 1 is
// passed in the num_luts position of the int_radix_lut constructor.
carry_extract_lut = new int_radix_lut<Torus>(
|
||||
streams, gpu_indexes, gpu_count, encryption_params, 1,
|
||||
num_radix_blocks, allocate_gpu_memory);
|
||||
|
||||
tmp_extracted_glwe = (Torus *)cuda_malloc_async(
|
||||
num_radix_blocks * glwe_accumulator_size * sizeof(Torus), streams[0],
|
||||
gpu_indexes[0]);
|
||||
tmp_indexes_array = (uint32_t *)cuda_malloc_async(
|
||||
num_radix_blocks * sizeof(uint32_t), streams[0], gpu_indexes[0]);
|
||||
tmp_extracted_lwe = (Torus *)cuda_malloc_async(
|
||||
num_radix_blocks * lwe_accumulator_size * sizeof(Torus), streams[0],
|
||||
gpu_indexes[0]);
|
||||
|
||||
// Carry extract LUT
|
||||
// The lambda captures a copy of encryption_params and divides its input by
// message_modulus.
auto carry_extract_f = [encryption_params](Torus x) -> Torus {
|
||||
return x / encryption_params.message_modulus;
|
||||
};
|
||||
|
||||
// Fill LUT 0 from carry_extract_f, then hand it to broadcast_lut.
// NOTE(review): presumably broadcast_lut copies the LUT to the other GPUs —
// confirm against int_radix_lut.
generate_device_accumulator<Torus>(
|
||||
streams[0], gpu_indexes[0],
|
||||
carry_extract_lut->get_lut(gpu_indexes[0], 0),
|
||||
encryption_params.glwe_dimension, encryption_params.polynomial_size,
|
||||
encryption_params.message_modulus, encryption_params.carry_modulus,
|
||||
carry_extract_f);
|
||||
|
||||
carry_extract_lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
|
||||
}
|
||||
}
|
||||
// Frees the three temporary buffers on streams[0] / gpu_indexes[0], then
// releases and deletes the carry-extract LUT.
void release(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
uint32_t gpu_count) {
|
||||
cuda_drop_async(tmp_extracted_glwe, streams[0], gpu_indexes[0]);
|
||||
cuda_drop_async(tmp_extracted_lwe, streams[0], gpu_indexes[0]);
|
||||
cuda_drop_async(tmp_indexes_array, streams[0], gpu_indexes[0]);
|
||||
|
||||
carry_extract_lut->release(streams, gpu_indexes, gpu_count);
|
||||
delete carry_extract_lut;
|
||||
}
|
||||
};
|
||||
#endif
|
||||
@@ -39,16 +39,15 @@ void *cuda_malloc_async(uint64_t size, cudaStream_t stream, uint32_t gpu_index);
|
||||
|
||||
void cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index);
|
||||
|
||||
bool cuda_check_support_cooperative_groups();
|
||||
|
||||
bool cuda_check_support_thread_block_clusters();
|
||||
|
||||
void cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
|
||||
cudaStream_t stream, uint32_t gpu_index);
|
||||
|
||||
void cuda_memcpy_async_gpu_to_gpu(void *dest, void *src, uint64_t size,
|
||||
cudaStream_t stream, uint32_t gpu_index);
|
||||
|
||||
void cuda_memcpy_gpu_to_gpu(void *dest, void *src, uint64_t size,
|
||||
uint32_t gpu_index);
|
||||
|
||||
void cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
|
||||
cudaStream_t stream, uint32_t gpu_index);
|
||||
|
||||
@@ -62,9 +61,13 @@ void cuda_synchronize_device(uint32_t gpu_index);
|
||||
void cuda_drop(void *ptr, uint32_t gpu_index);
|
||||
|
||||
void cuda_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index);
|
||||
}
|
||||
|
||||
int cuda_get_max_shared_memory(uint32_t gpu_index);
|
||||
}
|
||||
|
||||
bool cuda_check_support_cooperative_groups();
|
||||
|
||||
bool cuda_check_support_thread_block_clusters();
|
||||
|
||||
template <typename Torus>
|
||||
void cuda_set_value_async(cudaStream_t stream, uint32_t gpu_index,
|
||||
|
||||
@@ -1,14 +1,30 @@
|
||||
#ifndef HELPER_MULTI_GPU_H
|
||||
#define HELPER_MULTI_GPU_H
|
||||
#include <mutex>
|
||||
#include <variant>
|
||||
#include <vector>
|
||||
|
||||
extern std::mutex m;
|
||||
extern bool p2p_enabled;
|
||||
|
||||
extern "C" {
|
||||
int cuda_setup_multi_gpu();
|
||||
int32_t cuda_setup_multi_gpu();
|
||||
}
|
||||
|
||||
// Define a variant type that can be either a vector or a single pointer
|
||||
template <typename Torus>
|
||||
using LweArrayVariant = std::variant<std::vector<Torus *>, Torus *>;
|
||||
|
||||
// Reads one pointer out of a std::variant<std::vector<Torus *>, Torus *>:
// if `variant` currently holds the vector alternative, yields element
// `index` of that vector; otherwise yields the single Torus * it holds and
// `index` is ignored.
// The macro expands to an immediately-invoked lambda that captures by
// reference, and it names `Torus` directly, so a type called `Torus` must be
// in scope wherever the macro is used.
|
||||
#define GET_VARIANT_ELEMENT(variant, index) \
|
||||
[&] { \
|
||||
if (std::holds_alternative<std::vector<Torus *>>(variant)) { \
|
||||
return std::get<std::vector<Torus *>>(variant)[index]; \
|
||||
} else { \
|
||||
return std::get<Torus *>(variant); \
|
||||
} \
|
||||
}()
|
||||
|
||||
int get_active_gpu_count(int num_inputs, int gpu_count);
|
||||
|
||||
int get_num_inputs_on_gpu(int total_num_inputs, int gpu_index, int gpu_count);
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
#ifndef CUDA_INTEGER_H
|
||||
#define CUDA_INTEGER_H
|
||||
|
||||
#include "keyswitch.h"
|
||||
#include "pbs/programmable_bootstrap.cuh"
|
||||
#include "programmable_bootstrap.h"
|
||||
#include "programmable_bootstrap_multibit.h"
|
||||
@@ -15,7 +16,6 @@ enum SHIFT_OR_ROTATE_TYPE {
|
||||
LEFT_ROTATE = 2,
|
||||
RIGHT_ROTATE = 3
|
||||
};
|
||||
enum LUT_TYPE { OPERATOR = 0, MAXVALUE = 1, ISNONZERO = 2, BLOCKSLEN = 3 };
|
||||
enum BITOP_TYPE {
|
||||
BITAND = 0,
|
||||
BITOR = 1,
|
||||
@@ -80,13 +80,17 @@ void cleanup_cuda_apply_bivariate_lut_kb_64(void **streams,
|
||||
uint32_t gpu_count,
|
||||
int8_t **mem_ptr_void);
|
||||
|
||||
void cuda_apply_many_univariate_lut_kb_64(
|
||||
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
|
||||
void *output_radix_lwe, void *input_radix_lwe, int8_t *mem_ptr, void **ksks,
|
||||
void **bsks, uint32_t num_blocks, uint32_t num_luts, uint32_t lut_stride);
|
||||
|
||||
void scratch_cuda_full_propagation_64(
|
||||
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
|
||||
uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t num_radix_blocks,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
bool allocate_gpu_memory);
|
||||
uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory);
|
||||
|
||||
void cuda_full_propagation_64_inplace(void **streams, uint32_t *gpu_indexes,
|
||||
uint32_t gpu_count, void *input_blocks,
|
||||
@@ -102,7 +106,7 @@ void scratch_cuda_integer_mult_radix_ciphertext_kb_64(
|
||||
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t pbs_base_log,
|
||||
uint32_t pbs_level, uint32_t ks_base_log, uint32_t ks_level,
|
||||
uint32_t grouping_factor, uint32_t num_blocks, PBS_TYPE pbs_type,
|
||||
uint32_t max_shared_memory, bool allocate_gpu_memory);
|
||||
bool allocate_gpu_memory);
|
||||
|
||||
void cuda_integer_mult_radix_ciphertext_kb_64(
|
||||
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
|
||||
@@ -113,10 +117,11 @@ void cuda_integer_mult_radix_ciphertext_kb_64(
|
||||
void cleanup_cuda_integer_mult(void **streams, uint32_t *gpu_indexes,
|
||||
uint32_t gpu_count, int8_t **mem_ptr_void);
|
||||
|
||||
void cuda_negate_integer_radix_ciphertext_64_inplace(
|
||||
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
|
||||
uint32_t lwe_dimension, uint32_t lwe_ciphertext_count,
|
||||
uint32_t message_modulus, uint32_t carry_modulus);
|
||||
void cuda_negate_integer_radix_ciphertext_64(
|
||||
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
|
||||
void *lwe_array_out, void *lwe_array_in, uint32_t lwe_dimension,
|
||||
uint32_t lwe_ciphertext_count, uint32_t message_modulus,
|
||||
uint32_t carry_modulus);
|
||||
|
||||
void cuda_scalar_addition_integer_radix_ciphertext_64_inplace(
|
||||
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
|
||||
@@ -284,7 +289,7 @@ void cleanup_cuda_propagate_single_carry(void **streams, uint32_t *gpu_indexes,
|
||||
uint32_t gpu_count,
|
||||
int8_t **mem_ptr_void);
|
||||
|
||||
void scratch_cuda_integer_radix_sum_ciphertexts_vec_kb_64(
|
||||
void scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
|
||||
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t lwe_dimension,
|
||||
uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
|
||||
@@ -293,15 +298,14 @@ void scratch_cuda_integer_radix_sum_ciphertexts_vec_kb_64(
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
bool allocate_gpu_memory);
|
||||
|
||||
void cuda_integer_radix_sum_ciphertexts_vec_kb_64(
|
||||
void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
|
||||
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
|
||||
void *radix_lwe_out, void *radix_lwe_vec, uint32_t num_radix_in_vec,
|
||||
int8_t *mem_ptr, void **bsks, void **ksks, uint32_t num_blocks_in_radix);
|
||||
|
||||
void cleanup_cuda_integer_radix_sum_ciphertexts_vec(void **streams,
|
||||
uint32_t *gpu_indexes,
|
||||
uint32_t gpu_count,
|
||||
int8_t **mem_ptr_void);
|
||||
void cleanup_cuda_integer_radix_partial_sum_ciphertexts_vec(
|
||||
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
|
||||
int8_t **mem_ptr_void);
|
||||
|
||||
void scratch_cuda_integer_radix_overflowing_sub_kb_64(
|
||||
void **stream, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
|
||||
@@ -376,6 +380,30 @@ void cleanup_signed_overflowing_add_or_sub(void **streams,
|
||||
uint32_t *gpu_indexes,
|
||||
uint32_t gpu_count,
|
||||
int8_t **mem_ptr_void);
|
||||
|
||||
void scratch_cuda_integer_compute_prefix_sum_hillis_steele_64(
|
||||
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
|
||||
void *input_lut, uint32_t lwe_dimension, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
|
||||
PBS_TYPE pbs_type, bool allocate_gpu_memory);
|
||||
|
||||
void cuda_integer_compute_prefix_sum_hillis_steele_64(
|
||||
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
|
||||
void *output_radix_lwe, void *generates_or_propagates, int8_t *mem_ptr,
|
||||
void **ksks, void **bsks, uint32_t num_blocks, uint32_t shift);
|
||||
|
||||
void cleanup_cuda_integer_compute_prefix_sum_hillis_steele_64(
|
||||
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
|
||||
int8_t **mem_ptr_void);
|
||||
|
||||
void cuda_integer_reverse_blocks_64_inplace(void **streams,
|
||||
uint32_t *gpu_indexes,
|
||||
uint32_t gpu_count, void *lwe_array,
|
||||
uint32_t num_blocks,
|
||||
uint32_t lwe_size);
|
||||
|
||||
} // extern C
|
||||
|
||||
template <typename Torus>
|
||||
@@ -453,7 +481,8 @@ struct int_radix_params {
|
||||
message_modulus(message_modulus), carry_modulus(carry_modulus){};
|
||||
|
||||
void print() {
|
||||
printf("pbs_type: %u, glwe_dimension: %u, polynomial_size: %u, "
|
||||
printf("pbs_type: %u, glwe_dimension: %u, "
|
||||
"polynomial_size: %u, "
|
||||
"big_lwe_dimension: %u, "
|
||||
"small_lwe_dimension: %u, ks_level: %u, ks_base_log: %u, pbs_level: "
|
||||
"%u, pbs_base_log: "
|
||||
@@ -487,11 +516,21 @@ template <typename Torus> struct int_radix_lut {
|
||||
// for the moment
|
||||
Torus *lwe_indexes_in;
|
||||
Torus *lwe_indexes_out;
|
||||
Torus *h_lwe_indexes_in;
|
||||
Torus *h_lwe_indexes_out;
|
||||
// Enable optimizations if lwe_indexes_(in/out) are trivial
|
||||
bool using_trivial_lwe_indexes = true;
|
||||
// lwe_trivial_indexes is the intermediary index we need in case
|
||||
// lwe_indexes_in != lwe_indexes_out
|
||||
Torus *lwe_trivial_indexes;
|
||||
Torus *tmp_lwe_before_ks;
|
||||
Torus *tmp_lwe_after_ks;
|
||||
|
||||
/// For multi GPU execution we create vectors of pointers for inputs and
|
||||
/// outputs
|
||||
std::vector<Torus *> lwe_array_in_vec;
|
||||
std::vector<Torus *> lwe_after_ks_vec;
|
||||
std::vector<Torus *> lwe_after_pbs_vec;
|
||||
std::vector<Torus *> lwe_trivial_indexes_vec;
|
||||
|
||||
int_radix_lut(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
uint32_t gpu_count, int_radix_params params, uint32_t num_luts,
|
||||
@@ -511,13 +550,12 @@ template <typename Torus> struct int_radix_lut {
|
||||
cudaSetDevice(i);
|
||||
int8_t *gpu_pbs_buffer;
|
||||
auto num_blocks_on_gpu =
|
||||
get_num_inputs_on_gpu(num_radix_blocks, i, gpu_count);
|
||||
get_num_inputs_on_gpu(num_radix_blocks, i, active_gpu_count);
|
||||
|
||||
execute_scratch_pbs<Torus>(
|
||||
streams[i], gpu_indexes[i], &gpu_pbs_buffer, params.glwe_dimension,
|
||||
params.small_lwe_dimension, params.polynomial_size, params.pbs_level,
|
||||
params.grouping_factor, num_blocks_on_gpu,
|
||||
cuda_get_max_shared_memory(gpu_indexes[i]), params.pbs_type,
|
||||
params.grouping_factor, num_blocks_on_gpu, params.pbs_type,
|
||||
allocate_gpu_memory);
|
||||
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
|
||||
buffer.push_back(gpu_pbs_buffer);
|
||||
@@ -551,20 +589,43 @@ template <typename Torus> struct int_radix_lut {
|
||||
num_radix_blocks * sizeof(Torus), streams[0], gpu_indexes[0]);
|
||||
lwe_trivial_indexes = (Torus *)cuda_malloc_async(
|
||||
num_radix_blocks * sizeof(Torus), streams[0], gpu_indexes[0]);
|
||||
auto h_lwe_indexes = (Torus *)malloc(num_radix_blocks * sizeof(Torus));
|
||||
|
||||
h_lwe_indexes_in = (Torus *)malloc(num_radix_blocks * sizeof(Torus));
|
||||
h_lwe_indexes_out = (Torus *)malloc(num_radix_blocks * sizeof(Torus));
|
||||
|
||||
for (int i = 0; i < num_radix_blocks; i++)
|
||||
h_lwe_indexes[i] = i;
|
||||
h_lwe_indexes_in[i] = i;
|
||||
|
||||
cuda_memcpy_async_to_gpu(lwe_indexes_in, h_lwe_indexes,
|
||||
cuda_memcpy_async_to_gpu(lwe_indexes_in, h_lwe_indexes_in,
|
||||
num_radix_blocks * sizeof(Torus), streams[0],
|
||||
gpu_indexes[0]);
|
||||
cuda_memcpy_async_to_gpu(lwe_indexes_out, h_lwe_indexes,
|
||||
cuda_memcpy_async_to_gpu(lwe_indexes_out, h_lwe_indexes_in,
|
||||
num_radix_blocks * sizeof(Torus), streams[0],
|
||||
gpu_indexes[0]);
|
||||
cuda_memcpy_async_to_gpu(lwe_trivial_indexes, h_lwe_indexes,
|
||||
cuda_memcpy_async_to_gpu(lwe_trivial_indexes, h_lwe_indexes_in,
|
||||
num_radix_blocks * sizeof(Torus), streams[0],
|
||||
gpu_indexes[0]);
|
||||
memcpy(h_lwe_indexes_out, h_lwe_indexes_in,
|
||||
num_radix_blocks * sizeof(Torus));
|
||||
|
||||
/// With multiple GPUs we allocate arrays to be pushed to the vectors and
|
||||
/// copy data on each GPU then when we gather data to GPU 0 we can copy
|
||||
/// back to the original indexing
|
||||
multi_gpu_alloc_lwe_async(streams, gpu_indexes, active_gpu_count,
|
||||
lwe_array_in_vec, num_radix_blocks,
|
||||
params.big_lwe_dimension + 1);
|
||||
multi_gpu_alloc_lwe_async(streams, gpu_indexes, active_gpu_count,
|
||||
lwe_after_ks_vec, num_radix_blocks,
|
||||
params.small_lwe_dimension + 1);
|
||||
multi_gpu_alloc_lwe_async(streams, gpu_indexes, active_gpu_count,
|
||||
lwe_after_pbs_vec, num_radix_blocks,
|
||||
params.big_lwe_dimension + 1);
|
||||
multi_gpu_alloc_array_async(streams, gpu_indexes, active_gpu_count,
|
||||
lwe_trivial_indexes_vec, num_radix_blocks);
|
||||
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
|
||||
multi_gpu_copy_array_async(streams, gpu_indexes, active_gpu_count,
|
||||
lwe_trivial_indexes_vec, lwe_trivial_indexes,
|
||||
num_radix_blocks);
|
||||
|
||||
// Keyswitch
|
||||
Torus big_size =
|
||||
@@ -573,10 +634,6 @@ template <typename Torus> struct int_radix_lut {
|
||||
(params.small_lwe_dimension + 1) * num_radix_blocks * sizeof(Torus);
|
||||
tmp_lwe_before_ks =
|
||||
(Torus *)cuda_malloc_async(big_size, streams[0], gpu_indexes[0]);
|
||||
tmp_lwe_after_ks =
|
||||
(Torus *)cuda_malloc_async(small_size, streams[0], gpu_indexes[0]);
|
||||
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
|
||||
free(h_lwe_indexes);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -598,7 +655,14 @@ template <typename Torus> struct int_radix_lut {
|
||||
buffer = base_lut_object->buffer;
|
||||
// Keyswitch
|
||||
tmp_lwe_before_ks = base_lut_object->tmp_lwe_before_ks;
|
||||
tmp_lwe_after_ks = base_lut_object->tmp_lwe_after_ks;
|
||||
|
||||
/// With multiple GPUs we allocate arrays to be pushed to the vectors and
|
||||
/// copy data on each GPU then when we gather data to GPU 0 we can copy back
|
||||
/// to the original indexing
|
||||
lwe_array_in_vec = base_lut_object->lwe_array_in_vec;
|
||||
lwe_after_ks_vec = base_lut_object->lwe_after_ks_vec;
|
||||
lwe_after_pbs_vec = base_lut_object->lwe_after_pbs_vec;
|
||||
lwe_trivial_indexes_vec = base_lut_object->lwe_trivial_indexes_vec;
|
||||
|
||||
mem_reuse = true;
|
||||
|
||||
@@ -630,22 +694,24 @@ template <typename Torus> struct int_radix_lut {
|
||||
num_radix_blocks * sizeof(Torus), streams[0], gpu_indexes[0]);
|
||||
lwe_trivial_indexes = (Torus *)cuda_malloc_async(
|
||||
num_radix_blocks * sizeof(Torus), streams[0], gpu_indexes[0]);
|
||||
auto h_lwe_indexes = (Torus *)malloc(num_radix_blocks * sizeof(Torus));
|
||||
|
||||
h_lwe_indexes_in = (Torus *)malloc(num_radix_blocks * sizeof(Torus));
|
||||
h_lwe_indexes_out = (Torus *)malloc(num_radix_blocks * sizeof(Torus));
|
||||
|
||||
for (int i = 0; i < num_radix_blocks; i++)
|
||||
h_lwe_indexes[i] = i;
|
||||
h_lwe_indexes_in[i] = i;
|
||||
|
||||
cuda_memcpy_async_to_gpu(lwe_indexes_in, h_lwe_indexes,
|
||||
cuda_memcpy_async_to_gpu(lwe_indexes_in, h_lwe_indexes_in,
|
||||
num_radix_blocks * sizeof(Torus), streams[0],
|
||||
gpu_indexes[0]);
|
||||
cuda_memcpy_async_to_gpu(lwe_indexes_out, h_lwe_indexes,
|
||||
cuda_memcpy_async_to_gpu(lwe_indexes_out, h_lwe_indexes_in,
|
||||
num_radix_blocks * sizeof(Torus), streams[0],
|
||||
gpu_indexes[0]);
|
||||
cuda_memcpy_async_to_gpu(lwe_trivial_indexes, h_lwe_indexes,
|
||||
cuda_memcpy_async_to_gpu(lwe_trivial_indexes, h_lwe_indexes_in,
|
||||
num_radix_blocks * sizeof(Torus), streams[0],
|
||||
gpu_indexes[0]);
|
||||
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
|
||||
free(h_lwe_indexes);
|
||||
memcpy(h_lwe_indexes_out, h_lwe_indexes_in,
|
||||
num_radix_blocks * sizeof(Torus));
|
||||
}
|
||||
|
||||
// Return a pointer to idx-ith lut at gpu_index's global memory
|
||||
@@ -663,6 +729,22 @@ template <typename Torus> struct int_radix_lut {
|
||||
return &lut_indexes[ind];
|
||||
}
|
||||
|
||||
// If this function is called we assume the lwe_indexes_(in/out) are not the
|
||||
// trivial anymore and thus we disable optimizations
|
||||
void set_lwe_indexes(cudaStream_t stream, uint32_t gpu_index,
|
||||
Torus *h_indexes_in, Torus *h_indexes_out) {
|
||||
|
||||
memcpy(h_lwe_indexes_in, h_indexes_in, num_blocks * sizeof(Torus));
|
||||
memcpy(h_lwe_indexes_out, h_indexes_out, num_blocks * sizeof(Torus));
|
||||
|
||||
cuda_memcpy_async_to_gpu(lwe_indexes_in, h_lwe_indexes_in,
|
||||
num_blocks * sizeof(Torus), stream, gpu_index);
|
||||
cuda_memcpy_async_to_gpu(lwe_indexes_out, h_lwe_indexes_out,
|
||||
num_blocks * sizeof(Torus), stream, gpu_index);
|
||||
|
||||
using_trivial_lwe_indexes = false;
|
||||
}
|
||||
|
||||
// Broadcast luts from gpu src_gpu_idx to all active gpus
|
||||
void broadcast_lut(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
uint32_t src_gpu_idx) {
|
||||
@@ -672,7 +754,6 @@ template <typename Torus> struct int_radix_lut {
|
||||
auto src_lut_indexes = lut_indexes_vec[src_gpu_idx];
|
||||
|
||||
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
|
||||
#pragma omp parallel for num_threads(active_gpu_count)
|
||||
for (uint i = 0; i < active_gpu_count; i++) {
|
||||
if (i != src_gpu_idx) {
|
||||
auto dst_lut = lut_vec[i];
|
||||
@@ -690,7 +771,6 @@ template <typename Torus> struct int_radix_lut {
|
||||
|
||||
void release(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
uint32_t gpu_count) {
|
||||
#pragma omp parallel for num_threads(active_gpu_count)
|
||||
for (uint i = 0; i < active_gpu_count; i++) {
|
||||
cuda_drop_async(lut_vec[i], streams[i], gpu_indexes[i]);
|
||||
cuda_drop_async(lut_indexes_vec[i], streams[i], gpu_indexes[i]);
|
||||
@@ -701,9 +781,13 @@ template <typename Torus> struct int_radix_lut {
|
||||
cuda_drop_async(lwe_indexes_in, streams[0], gpu_indexes[0]);
|
||||
cuda_drop_async(lwe_indexes_out, streams[0], gpu_indexes[0]);
|
||||
cuda_drop_async(lwe_trivial_indexes, streams[0], gpu_indexes[0]);
|
||||
|
||||
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
|
||||
free(h_lwe_indexes_in);
|
||||
free(h_lwe_indexes_out);
|
||||
|
||||
if (!mem_reuse) {
|
||||
cuda_drop_async(tmp_lwe_before_ks, streams[0], gpu_indexes[0]);
|
||||
cuda_drop_async(tmp_lwe_after_ks, streams[0], gpu_indexes[0]);
|
||||
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
|
||||
for (int i = 0; i < buffer.size(); i++) {
|
||||
switch (params.pbs_type) {
|
||||
@@ -721,10 +805,20 @@ template <typename Torus> struct int_radix_lut {
|
||||
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
|
||||
}
|
||||
buffer.clear();
|
||||
|
||||
multi_gpu_release_async(streams, gpu_indexes, lwe_array_in_vec);
|
||||
multi_gpu_release_async(streams, gpu_indexes, lwe_after_ks_vec);
|
||||
multi_gpu_release_async(streams, gpu_indexes, lwe_after_pbs_vec);
|
||||
multi_gpu_release_async(streams, gpu_indexes, lwe_trivial_indexes_vec);
|
||||
for (uint i = 0; i < active_gpu_count; i++)
|
||||
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
|
||||
lwe_array_in_vec.clear();
|
||||
lwe_after_ks_vec.clear();
|
||||
lwe_after_pbs_vec.clear();
|
||||
lwe_trivial_indexes_vec.clear();
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Torus> struct int_bit_extract_luts_buffer {
|
||||
int_radix_params params;
|
||||
int_radix_lut<Torus> *lut;
|
||||
@@ -782,10 +876,6 @@ template <typename Torus> struct int_bit_extract_luts_buffer {
|
||||
for (int i = 0; i < bits_per_block; i++)
|
||||
h_lwe_indexes_in[i + j * bits_per_block] = j;
|
||||
}
|
||||
cuda_memcpy_async_to_gpu(lut->lwe_indexes_in, h_lwe_indexes_in,
|
||||
num_radix_blocks * bits_per_block *
|
||||
sizeof(Torus),
|
||||
streams[0], gpu_indexes[0]);
|
||||
|
||||
/**
|
||||
* the output should aim different lwe ciphertexts, so lwe_indexes_out =
|
||||
@@ -797,10 +887,9 @@ template <typename Torus> struct int_bit_extract_luts_buffer {
|
||||
for (int i = 0; i < num_radix_blocks * bits_per_block; i++)
|
||||
h_lwe_indexes_out[i] = i;
|
||||
|
||||
cuda_memcpy_async_to_gpu(lut->lwe_indexes_out, h_lwe_indexes_out,
|
||||
num_radix_blocks * bits_per_block *
|
||||
sizeof(Torus),
|
||||
streams[0], gpu_indexes[0]);
|
||||
lut->set_lwe_indexes(streams[0], gpu_indexes[0], h_lwe_indexes_in,
|
||||
h_lwe_indexes_out);
|
||||
|
||||
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
|
||||
free(h_lut_indexes);
|
||||
free(h_lwe_indexes_in);
|
||||
@@ -887,28 +976,52 @@ template <typename Torus> struct int_shift_and_rotate_buffer {
|
||||
(params.big_lwe_dimension + 1) *
|
||||
sizeof(Torus),
|
||||
streams[0], gpu_indexes[0]);
|
||||
cuda_memset_async(tmp_bits, 0,
|
||||
bits_per_block * num_radix_blocks *
|
||||
(params.big_lwe_dimension + 1) * sizeof(Torus),
|
||||
streams[0], gpu_indexes[0]);
|
||||
tmp_shift_bits = (Torus *)cuda_malloc_async(
|
||||
max_num_bits_that_tell_shift * num_radix_blocks *
|
||||
(params.big_lwe_dimension + 1) * sizeof(Torus),
|
||||
streams[0], gpu_indexes[0]);
|
||||
cuda_memset_async(tmp_shift_bits, 0,
|
||||
max_num_bits_that_tell_shift * num_radix_blocks *
|
||||
(params.big_lwe_dimension + 1) * sizeof(Torus),
|
||||
streams[0], gpu_indexes[0]);
|
||||
|
||||
tmp_rotated = (Torus *)cuda_malloc_async(
|
||||
bits_per_block * num_radix_blocks * (params.big_lwe_dimension + 1) *
|
||||
sizeof(Torus),
|
||||
streams[0], gpu_indexes[0]);
|
||||
cuda_memset_async(tmp_rotated, 0,
|
||||
bits_per_block * num_radix_blocks *
|
||||
(params.big_lwe_dimension + 1) * sizeof(Torus),
|
||||
streams[0], gpu_indexes[0]);
|
||||
|
||||
tmp_input_bits_a = (Torus *)cuda_malloc_async(
|
||||
bits_per_block * num_radix_blocks * (params.big_lwe_dimension + 1) *
|
||||
sizeof(Torus),
|
||||
streams[0], gpu_indexes[0]);
|
||||
cuda_memset_async(tmp_input_bits_a, 0,
|
||||
bits_per_block * num_radix_blocks *
|
||||
(params.big_lwe_dimension + 1) * sizeof(Torus),
|
||||
streams[0], gpu_indexes[0]);
|
||||
tmp_input_bits_b = (Torus *)cuda_malloc_async(
|
||||
bits_per_block * num_radix_blocks * (params.big_lwe_dimension + 1) *
|
||||
sizeof(Torus),
|
||||
streams[0], gpu_indexes[0]);
|
||||
cuda_memset_async(tmp_input_bits_b, 0,
|
||||
bits_per_block * num_radix_blocks *
|
||||
(params.big_lwe_dimension + 1) * sizeof(Torus),
|
||||
streams[0], gpu_indexes[0]);
|
||||
tmp_mux_inputs = (Torus *)cuda_malloc_async(
|
||||
bits_per_block * num_radix_blocks * (params.big_lwe_dimension + 1) *
|
||||
sizeof(Torus),
|
||||
streams[0], gpu_indexes[0]);
|
||||
cuda_memset_async(tmp_mux_inputs, 0,
|
||||
bits_per_block * num_radix_blocks *
|
||||
(params.big_lwe_dimension + 1) * sizeof(Torus),
|
||||
streams[0], gpu_indexes[0]);
|
||||
|
||||
auto mux_lut_f = [](Torus x) -> Torus {
|
||||
// x is expected to be x = 0bcba
|
||||
@@ -974,10 +1087,10 @@ template <typename Torus> struct int_fullprop_buffer {
|
||||
|
||||
int_fullprop_buffer(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
uint32_t gpu_count, int_radix_params params,
|
||||
uint32_t num_radix_blocks, bool allocate_gpu_memory) {
|
||||
bool allocate_gpu_memory) {
|
||||
this->params = params;
|
||||
lut = new int_radix_lut<Torus>(streams, gpu_indexes, 1, params, 2,
|
||||
num_radix_blocks, allocate_gpu_memory);
|
||||
lut = new int_radix_lut<Torus>(streams, gpu_indexes, 1, params, 2, 2,
|
||||
allocate_gpu_memory);
|
||||
|
||||
if (allocate_gpu_memory) {
|
||||
|
||||
@@ -1003,9 +1116,9 @@ template <typename Torus> struct int_fullprop_buffer {
|
||||
params.polynomial_size, params.message_modulus, params.carry_modulus,
|
||||
lut_f_carry);
|
||||
|
||||
Torus lwe_indexes_size = num_radix_blocks * sizeof(Torus);
|
||||
Torus lwe_indexes_size = 2 * sizeof(Torus);
|
||||
Torus *h_lwe_indexes = (Torus *)malloc(lwe_indexes_size);
|
||||
for (int i = 0; i < num_radix_blocks; i++)
|
||||
for (int i = 0; i < 2; i++)
|
||||
h_lwe_indexes[i] = i;
|
||||
Torus *lwe_indexes = lut->get_lut_indexes(gpu_indexes[0], 0);
|
||||
cuda_memcpy_async_to_gpu(lwe_indexes, h_lwe_indexes, lwe_indexes_size,
|
||||
@@ -1068,6 +1181,11 @@ template <typename Torus> struct int_sc_prop_memory {
|
||||
num_radix_blocks * big_lwe_size_bytes, streams[0], gpu_indexes[0]);
|
||||
step_output = (Torus *)cuda_malloc_async(
|
||||
num_radix_blocks * big_lwe_size_bytes, streams[0], gpu_indexes[0]);
|
||||
cuda_memset_async(generates_or_propagates, 0,
|
||||
num_radix_blocks * big_lwe_size_bytes, streams[0],
|
||||
gpu_indexes[0]);
|
||||
cuda_memset_async(step_output, 0, num_radix_blocks * big_lwe_size_bytes,
|
||||
streams[0], gpu_indexes[0]);
|
||||
|
||||
// declare functions for lut generation
|
||||
auto f_lut_does_block_generate_carry = [message_modulus](Torus x) -> Torus {
|
||||
@@ -1184,6 +1302,11 @@ template <typename Torus> struct int_overflowing_sub_memory {
|
||||
num_radix_blocks * big_lwe_size_bytes, streams[0], gpu_indexes[0]);
|
||||
step_output = (Torus *)cuda_malloc_async(
|
||||
num_radix_blocks * big_lwe_size_bytes, streams[0], gpu_indexes[0]);
|
||||
cuda_memset_async(generates_or_propagates, 0,
|
||||
num_radix_blocks * big_lwe_size_bytes, streams[0],
|
||||
gpu_indexes[0]);
|
||||
cuda_memset_async(step_output, 0, num_radix_blocks * big_lwe_size_bytes,
|
||||
streams[0], gpu_indexes[0]);
|
||||
|
||||
// declare functions for lut generation
|
||||
auto f_lut_does_block_generate_carry = [message_modulus](Torus x) -> Torus {
|
||||
@@ -1273,6 +1396,7 @@ template <typename Torus> struct int_overflowing_sub_memory {
|
||||
|
||||
template <typename Torus> struct int_sum_ciphertexts_vec_memory {
|
||||
Torus *new_blocks;
|
||||
Torus *new_blocks_copy;
|
||||
Torus *old_blocks;
|
||||
Torus *small_lwe_vector;
|
||||
int_radix_params params;
|
||||
@@ -1300,17 +1424,40 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
|
||||
new_blocks = (Torus *)cuda_malloc_async(
|
||||
max_pbs_count * (params.big_lwe_dimension + 1) * sizeof(Torus),
|
||||
streams[0], gpu_indexes[0]);
|
||||
new_blocks_copy = (Torus *)cuda_malloc_async(
|
||||
max_pbs_count * (params.big_lwe_dimension + 1) * sizeof(Torus),
|
||||
streams[0], gpu_indexes[0]);
|
||||
old_blocks = (Torus *)cuda_malloc_async(
|
||||
max_pbs_count * (params.big_lwe_dimension + 1) * sizeof(Torus),
|
||||
streams[0], gpu_indexes[0]);
|
||||
small_lwe_vector = (Torus *)cuda_malloc_async(
|
||||
max_pbs_count * (params.small_lwe_dimension + 1) * sizeof(Torus),
|
||||
streams[0], gpu_indexes[0]);
|
||||
cuda_memset_async(new_blocks, 0,
|
||||
max_pbs_count * (params.big_lwe_dimension + 1) *
|
||||
sizeof(Torus),
|
||||
streams[0], gpu_indexes[0]);
|
||||
cuda_memset_async(new_blocks_copy, 0,
|
||||
max_pbs_count * (params.big_lwe_dimension + 1) *
|
||||
sizeof(Torus),
|
||||
streams[0], gpu_indexes[0]);
|
||||
cuda_memset_async(old_blocks, 0,
|
||||
max_pbs_count * (params.big_lwe_dimension + 1) *
|
||||
sizeof(Torus),
|
||||
streams[0], gpu_indexes[0]);
|
||||
cuda_memset_async(small_lwe_vector, 0,
|
||||
max_pbs_count * (params.small_lwe_dimension + 1) *
|
||||
sizeof(Torus),
|
||||
streams[0], gpu_indexes[0]);
|
||||
|
||||
d_smart_copy_in = (int32_t *)cuda_malloc_async(
|
||||
max_pbs_count * sizeof(int32_t), streams[0], gpu_indexes[0]);
|
||||
d_smart_copy_out = (int32_t *)cuda_malloc_async(
|
||||
max_pbs_count * sizeof(int32_t), streams[0], gpu_indexes[0]);
|
||||
cuda_memset_async(d_smart_copy_in, 0, max_pbs_count * sizeof(int32_t),
|
||||
streams[0], gpu_indexes[0]);
|
||||
cuda_memset_async(d_smart_copy_out, 0, max_pbs_count * sizeof(int32_t),
|
||||
streams[0], gpu_indexes[0]);
|
||||
}
|
||||
|
||||
int_sum_ciphertexts_vec_memory(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
@@ -1331,11 +1478,22 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
|
||||
this->new_blocks = new_blocks;
|
||||
this->old_blocks = old_blocks;
|
||||
this->small_lwe_vector = small_lwe_vector;
|
||||
new_blocks_copy = (Torus *)cuda_malloc_async(
|
||||
max_pbs_count * (params.big_lwe_dimension + 1) * sizeof(Torus),
|
||||
streams[0], gpu_indexes[0]);
|
||||
cuda_memset_async(new_blocks_copy, 0,
|
||||
max_pbs_count * (params.big_lwe_dimension + 1) *
|
||||
sizeof(Torus),
|
||||
streams[0], gpu_indexes[0]);
|
||||
|
||||
d_smart_copy_in = (int32_t *)cuda_malloc_async(
|
||||
max_pbs_count * sizeof(int32_t), streams[0], gpu_indexes[0]);
|
||||
d_smart_copy_out = (int32_t *)cuda_malloc_async(
|
||||
max_pbs_count * sizeof(int32_t), streams[0], gpu_indexes[0]);
|
||||
cuda_memset_async(d_smart_copy_in, 0, max_pbs_count * sizeof(int32_t),
|
||||
streams[0], gpu_indexes[0]);
|
||||
cuda_memset_async(d_smart_copy_out, 0, max_pbs_count * sizeof(int32_t),
|
||||
streams[0], gpu_indexes[0]);
|
||||
}
|
||||
|
||||
void release(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
@@ -1349,8 +1507,8 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
|
||||
cuda_drop_async(small_lwe_vector, streams[0], gpu_indexes[0]);
|
||||
}
|
||||
|
||||
cuda_drop_async(new_blocks_copy, streams[0], gpu_indexes[0]);
|
||||
scp_mem->release(streams, gpu_indexes, gpu_count);
|
||||
|
||||
delete scp_mem;
|
||||
}
|
||||
};
|
||||
@@ -1666,6 +1824,7 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
|
||||
|
||||
cudaStream_t *local_streams_1;
|
||||
cudaStream_t *local_streams_2;
|
||||
uint32_t active_gpu_count;
|
||||
|
||||
int_arithmetic_scalar_shift_buffer(cudaStream_t *streams,
|
||||
uint32_t *gpu_indexes, uint32_t gpu_count,
|
||||
@@ -1673,12 +1832,15 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
|
||||
int_radix_params params,
|
||||
uint32_t num_radix_blocks,
|
||||
bool allocate_gpu_memory) {
|
||||
active_gpu_count = get_active_gpu_count(1, gpu_count);
|
||||
// In the arithmetic shift, a PBS has to be applied to the last rotated
|
||||
// block twice: once to shift it, once to compute the padding block to be
|
||||
// copied onto all blocks to the left of the last rotated block
|
||||
local_streams_1 = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t));
|
||||
local_streams_2 = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t));
|
||||
for (uint j = 0; j < gpu_count; j++) {
|
||||
local_streams_1 =
|
||||
(cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
|
||||
local_streams_2 =
|
||||
(cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
|
||||
for (uint j = 0; j < active_gpu_count; j++) {
|
||||
local_streams_1[j] = cuda_create_stream(gpu_indexes[j]);
|
||||
local_streams_2[j] = cuda_create_stream(gpu_indexes[j]);
|
||||
}
|
||||
@@ -1689,12 +1851,12 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
|
||||
uint32_t big_lwe_size = params.big_lwe_dimension + 1;
|
||||
uint32_t big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
|
||||
|
||||
tmp_rotated = (Torus *)cuda_malloc_async((num_radix_blocks + 2) *
|
||||
tmp_rotated = (Torus *)cuda_malloc_async((num_radix_blocks + 3) *
|
||||
big_lwe_size_bytes,
|
||||
streams[0], gpu_indexes[0]);
|
||||
|
||||
cuda_memset_async(tmp_rotated, 0,
|
||||
(num_radix_blocks + 2) * big_lwe_size_bytes, streams[0],
|
||||
(num_radix_blocks + 3) * big_lwe_size_bytes, streams[0],
|
||||
gpu_indexes[0]);
|
||||
|
||||
uint32_t num_bits_in_block = (uint32_t)std::log2(params.message_modulus);
|
||||
@@ -1811,7 +1973,7 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
|
||||
|
||||
void release(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
uint32_t gpu_count) {
|
||||
for (uint j = 0; j < gpu_count; j++) {
|
||||
for (uint j = 0; j < active_gpu_count; j++) {
|
||||
cuda_destroy_stream(local_streams_1[j], gpu_indexes[j]);
|
||||
cuda_destroy_stream(local_streams_2[j], gpu_indexes[j]);
|
||||
}
|
||||
@@ -1840,20 +2002,24 @@ template <typename Torus> struct int_zero_out_if_buffer {
|
||||
|
||||
cudaStream_t *true_streams;
|
||||
cudaStream_t *false_streams;
|
||||
uint32_t active_gpu_count;
|
||||
|
||||
int_zero_out_if_buffer(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
uint32_t gpu_count, int_radix_params params,
|
||||
uint32_t num_radix_blocks, bool allocate_gpu_memory) {
|
||||
this->params = params;
|
||||
active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
|
||||
|
||||
Torus big_size =
|
||||
(params.big_lwe_dimension + 1) * num_radix_blocks * sizeof(Torus);
|
||||
if (allocate_gpu_memory) {
|
||||
tmp = (Torus *)cuda_malloc_async(big_size, streams[0], gpu_indexes[0]);
|
||||
// We may use a different stream to allow concurrent operation
|
||||
true_streams = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t));
|
||||
false_streams = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t));
|
||||
for (uint j = 0; j < gpu_count; j++) {
|
||||
true_streams =
|
||||
(cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
|
||||
false_streams =
|
||||
(cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
|
||||
for (uint j = 0; j < active_gpu_count; j++) {
|
||||
true_streams[j] = cuda_create_stream(gpu_indexes[j]);
|
||||
false_streams[j] = cuda_create_stream(gpu_indexes[j]);
|
||||
}
|
||||
@@ -1862,7 +2028,7 @@ template <typename Torus> struct int_zero_out_if_buffer {
|
||||
void release(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
uint32_t gpu_count) {
|
||||
cuda_drop_async(tmp, streams[0], gpu_indexes[0]);
|
||||
for (uint j = 0; j < gpu_count; j++) {
|
||||
for (uint j = 0; j < active_gpu_count; j++) {
|
||||
cuda_destroy_stream(true_streams[j], gpu_indexes[j]);
|
||||
cuda_destroy_stream(false_streams[j], gpu_indexes[j]);
|
||||
}
|
||||
@@ -1996,7 +2162,7 @@ template <typename Torus> struct int_are_all_block_true_buffer {
|
||||
|
||||
if (allocate_gpu_memory) {
|
||||
Torus total_modulus = params.message_modulus * params.carry_modulus;
|
||||
uint32_t max_value = total_modulus - 1;
|
||||
uint32_t max_value = (total_modulus - 1) / (params.message_modulus - 1);
|
||||
|
||||
int max_chunks = (num_radix_blocks + max_value - 1) / max_value;
|
||||
tmp_block_accumulated = (Torus *)cuda_malloc_async(
|
||||
@@ -2012,6 +2178,7 @@ template <typename Torus> struct int_are_all_block_true_buffer {
|
||||
uint32_t gpu_count) {
|
||||
for (auto &lut : is_equal_to_lut_map) {
|
||||
lut.second->release(streams, gpu_indexes, gpu_count);
|
||||
delete (lut.second);
|
||||
}
|
||||
is_equal_to_lut_map.clear();
|
||||
|
||||
@@ -2294,6 +2461,7 @@ template <typename Torus> struct int_comparison_buffer {
|
||||
int_radix_lut<Torus> *signed_msb_lut;
|
||||
cudaStream_t *lsb_streams;
|
||||
cudaStream_t *msb_streams;
|
||||
uint32_t active_gpu_count;
|
||||
|
||||
int_comparison_buffer(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
uint32_t gpu_count, COMPARISON_TYPE op,
|
||||
@@ -2303,14 +2471,18 @@ template <typename Torus> struct int_comparison_buffer {
|
||||
this->op = op;
|
||||
this->is_signed = is_signed;
|
||||
|
||||
active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
|
||||
|
||||
identity_lut_f = [](Torus x) -> Torus { return x; };
|
||||
|
||||
auto big_lwe_size = params.big_lwe_dimension + 1;
|
||||
|
||||
if (allocate_gpu_memory) {
|
||||
lsb_streams = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t));
|
||||
msb_streams = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t));
|
||||
for (uint j = 0; j < gpu_count; j++) {
|
||||
lsb_streams =
|
||||
(cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
|
||||
msb_streams =
|
||||
(cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
|
||||
for (uint j = 0; j < active_gpu_count; j++) {
|
||||
lsb_streams[j] = cuda_create_stream(gpu_indexes[j]);
|
||||
msb_streams[j] = cuda_create_stream(gpu_indexes[j]);
|
||||
}
|
||||
@@ -2474,7 +2646,7 @@ template <typename Torus> struct int_comparison_buffer {
|
||||
signed_msb_lut->release(streams, gpu_indexes, gpu_count);
|
||||
delete (signed_msb_lut);
|
||||
}
|
||||
for (uint j = 0; j < gpu_count; j++) {
|
||||
for (uint j = 0; j < active_gpu_count; j++) {
|
||||
cuda_destroy_stream(lsb_streams[j], gpu_indexes[j]);
|
||||
cuda_destroy_stream(msb_streams[j], gpu_indexes[j]);
|
||||
}
|
||||
@@ -2485,6 +2657,7 @@ template <typename Torus> struct int_comparison_buffer {
|
||||
|
||||
template <typename Torus> struct int_div_rem_memory {
|
||||
int_radix_params params;
|
||||
uint32_t active_gpu_count;
|
||||
|
||||
// memory objects for other operations
|
||||
int_logical_scalar_shift_buffer<Torus> *shift_mem_1;
|
||||
@@ -2720,6 +2893,8 @@ template <typename Torus> struct int_div_rem_memory {
|
||||
int_div_rem_memory(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
uint32_t gpu_count, int_radix_params params,
|
||||
uint32_t num_blocks, bool allocate_gpu_memory) {
|
||||
active_gpu_count = get_active_gpu_count(2 * num_blocks, gpu_count);
|
||||
|
||||
this->params = params;
|
||||
shift_mem_1 = new int_logical_scalar_shift_buffer<Torus>(
|
||||
streams, gpu_indexes, gpu_count, SHIFT_OR_ROTATE_TYPE::LEFT_SHIFT,
|
||||
@@ -2739,11 +2914,15 @@ template <typename Torus> struct int_div_rem_memory {
|
||||
init_lookup_tables(streams, gpu_indexes, gpu_count, num_blocks);
|
||||
init_temporary_buffers(streams, gpu_indexes, gpu_count, num_blocks);
|
||||
|
||||
sub_streams_1 = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t));
|
||||
sub_streams_2 = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t));
|
||||
sub_streams_3 = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t));
|
||||
sub_streams_4 = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t));
|
||||
for (uint j = 0; j < gpu_count; j++) {
|
||||
sub_streams_1 =
|
||||
(cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
|
||||
sub_streams_2 =
|
||||
(cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
|
||||
sub_streams_3 =
|
||||
(cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
|
||||
sub_streams_4 =
|
||||
(cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
|
||||
for (uint j = 0; j < active_gpu_count; j++) {
|
||||
sub_streams_1[j] = cuda_create_stream(gpu_indexes[j]);
|
||||
sub_streams_2[j] = cuda_create_stream(gpu_indexes[j]);
|
||||
sub_streams_3[j] = cuda_create_stream(gpu_indexes[j]);
|
||||
@@ -2814,7 +2993,7 @@ template <typename Torus> struct int_div_rem_memory {
|
||||
delete[] merge_overflow_flags_luts;
|
||||
|
||||
// release sub streams
|
||||
for (uint i = 0; i < gpu_count; i++) {
|
||||
for (uint i = 0; i < active_gpu_count; i++) {
|
||||
cuda_destroy_stream(sub_streams_1[i], gpu_indexes[i]);
|
||||
cuda_destroy_stream(sub_streams_2[i], gpu_indexes[i]);
|
||||
cuda_destroy_stream(sub_streams_3[i], gpu_indexes[i]);
|
||||
@@ -2868,39 +3047,45 @@ template <typename Torus> struct int_last_block_inner_propagate_memory {
|
||||
auto f_last_block_inner_propagation_lut =
|
||||
[op, message_modulus, message_bit_mask,
|
||||
bits_of_message](Torus lhs_block, Torus rhs_block) -> Torus {
|
||||
Torus local_rhs_block = 0;
|
||||
uint64_t rhs_block_modified;
|
||||
if (op == SIGNED_OPERATION::SUBTRACTION) {
|
||||
Torus flipped_rhs = !rhs_block;
|
||||
local_rhs_block = (flipped_rhs << 1) & message_bit_mask;
|
||||
} else {
|
||||
local_rhs_block = (rhs_block << 1) & message_bit_mask;
|
||||
};
|
||||
// Subtraction is done by adding the negation
|
||||
// Negation(x) = bit_flip(x) + 1
|
||||
// Only add the flipped value, the +1 will be resolved by carry
|
||||
// propagation computation
|
||||
uint64_t flipped_rhs = ~rhs_block;
|
||||
|
||||
Torus local_lhs_block = (lhs_block << 1) & message_bit_mask;
|
||||
// Remove the last bit, it's not interesting in this step
|
||||
rhs_block_modified = (flipped_rhs << 1) & message_bit_mask;
|
||||
} else {
|
||||
rhs_block_modified = (rhs_block << 1) & message_bit_mask;
|
||||
}
|
||||
|
||||
uint64_t lhs_block_modified = (lhs_block << 1) & message_bit_mask;
|
||||
|
||||
// whole_result contains the result of addition with
|
||||
// the carry being in the first bit of carry space
|
||||
// the message space contains the message, but with one 0
|
||||
// on the right (lsb)
|
||||
Torus whole_result = local_lhs_block + local_rhs_block;
|
||||
Torus carry = whole_result >> bits_of_message;
|
||||
Torus result = (whole_result & message_bit_mask) >> 1;
|
||||
Torus propagation_result = 0;
|
||||
// on the right (LSB)
|
||||
uint64_t whole_result = lhs_block_modified + rhs_block_modified;
|
||||
uint64_t carry = whole_result >> bits_of_message;
|
||||
uint64_t result = (whole_result & message_bit_mask) >> 1;
|
||||
OUTPUT_CARRY propagation_result;
|
||||
if (carry == 1) {
|
||||
// Addition of bits before last one generates a carry
|
||||
// Addition of bits before the last one generates a carry
|
||||
propagation_result = OUTPUT_CARRY::GENERATED;
|
||||
} else if (result == ((message_modulus - 1) >> 1)) {
|
||||
// Addition of bits before last one puts the bits
|
||||
// in a state that makes it so that an input carry into last block
|
||||
// gets propagated to last bit.
|
||||
// Addition of bits before the last one puts the bits
|
||||
// in a state that makes it so that an input carry into the last block
|
||||
// gets propagated to the last bit.
|
||||
propagation_result = OUTPUT_CARRY::PROPAGATED;
|
||||
} else {
|
||||
propagation_result = OUTPUT_CARRY::NONE;
|
||||
};
|
||||
}
|
||||
|
||||
// Shift the propagation result in carry part
|
||||
// Shift the propagation result in the carry part
|
||||
// to have less noise growth later
|
||||
return propagation_result << bits_of_message;
|
||||
return (static_cast<uint64_t>(propagation_result) << bits_of_message);
|
||||
};
|
||||
|
||||
last_block_inner_propagation_lut = new int_radix_lut<Torus>(
|
||||
@@ -2985,12 +3170,12 @@ template <typename Torus> struct int_resolve_signed_overflow_memory {
|
||||
|
||||
template <typename Torus> struct int_signed_overflowing_add_or_sub_memory {
|
||||
int_radix_params params;
|
||||
uint32_t active_gpu_count;
|
||||
|
||||
// memory objects for other operations
|
||||
int_sc_prop_memory<Torus> *scp_mem;
|
||||
int_last_block_inner_propagate_memory<Torus> *las_block_prop_mem;
|
||||
int_resolve_signed_overflow_memory<Torus> *resolve_overflow_mem;
|
||||
// lookupt tables
|
||||
|
||||
// sub streams
|
||||
cudaStream_t *sub_streams_1;
|
||||
@@ -2999,6 +3184,7 @@ template <typename Torus> struct int_signed_overflowing_add_or_sub_memory {
|
||||
// temporary device buffers
|
||||
Torus *result; // num_blocks
|
||||
Torus *input_carries; // num_blocks
|
||||
Torus *neg_rhs; // num_blocks
|
||||
Torus *output_carry; // single block
|
||||
Torus *last_block_inner_propagation; // single block
|
||||
|
||||
@@ -3011,6 +3197,9 @@ template <typename Torus> struct int_signed_overflowing_add_or_sub_memory {
|
||||
result = (Torus *)cuda_malloc_async(
|
||||
big_lwe_size * num_blocks * sizeof(Torus), streams[0], gpu_indexes[0]);
|
||||
|
||||
neg_rhs = (Torus *)cuda_malloc_async(
|
||||
big_lwe_size * num_blocks * sizeof(Torus), streams[0], gpu_indexes[0]);
|
||||
|
||||
input_carries = (Torus *)cuda_malloc_async(
|
||||
big_lwe_size * num_blocks * sizeof(Torus), streams[0], gpu_indexes[0]);
|
||||
output_carry = (Torus *)cuda_malloc_async(big_lwe_size * sizeof(Torus),
|
||||
@@ -3025,13 +3214,17 @@ template <typename Torus> struct int_signed_overflowing_add_or_sub_memory {
|
||||
int_radix_params params, uint32_t num_blocks, SIGNED_OPERATION op,
|
||||
bool allocate_gpu_memory) {
|
||||
this->params = params;
|
||||
active_gpu_count = get_active_gpu_count(num_blocks, gpu_count);
|
||||
|
||||
allocate_temporary_buffers(streams, gpu_indexes, gpu_count, num_blocks);
|
||||
allocate_temporary_buffers(streams, gpu_indexes, active_gpu_count,
|
||||
num_blocks);
|
||||
|
||||
// initialize streams
|
||||
sub_streams_1 = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t));
|
||||
sub_streams_2 = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t));
|
||||
for (uint j = 0; j < gpu_count; j++) {
|
||||
sub_streams_1 =
|
||||
(cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
|
||||
sub_streams_2 =
|
||||
(cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
|
||||
for (uint j = 0; j < active_gpu_count; j++) {
|
||||
sub_streams_1[j] = cuda_create_stream(gpu_indexes[j]);
|
||||
sub_streams_2[j] = cuda_create_stream(gpu_indexes[j]);
|
||||
}
|
||||
@@ -3061,12 +3254,13 @@ template <typename Torus> struct int_signed_overflowing_add_or_sub_memory {
|
||||
|
||||
// temporary device buffers
|
||||
cuda_drop_async(result, streams[0], gpu_indexes[0]);
|
||||
cuda_drop_async(neg_rhs, streams[0], gpu_indexes[0]);
|
||||
cuda_drop_async(input_carries, streams[0], gpu_indexes[0]);
|
||||
cuda_drop_async(output_carry, streams[0], gpu_indexes[0]);
|
||||
cuda_drop_async(last_block_inner_propagation, streams[0], gpu_indexes[0]);
|
||||
|
||||
// sub streams
|
||||
for (uint i = 0; i < gpu_count; i++) {
|
||||
for (uint i = 0; i < active_gpu_count; i++) {
|
||||
cuda_destroy_stream(sub_streams_1[i], gpu_indexes[i]);
|
||||
cuda_destroy_stream(sub_streams_2[i], gpu_indexes[i]);
|
||||
}
|
||||
|
||||
@@ -9,15 +9,28 @@ void cuda_keyswitch_lwe_ciphertext_vector_32(
|
||||
void *stream, uint32_t gpu_index, void *lwe_array_out,
|
||||
void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
|
||||
void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t gpu_offset = 0);
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples);
|
||||
|
||||
void cuda_keyswitch_lwe_ciphertext_vector_64(
|
||||
void *stream, uint32_t gpu_index, void *lwe_array_out,
|
||||
void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
|
||||
void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t gpu_offset = 0);
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples);
|
||||
|
||||
void scratch_packing_keyswitch_lwe_list_to_glwe_64(
|
||||
void *stream, uint32_t gpu_index, int8_t **fp_ks_buffer,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t num_lwes,
|
||||
bool allocate_gpu_memory);
|
||||
|
||||
void cuda_packing_keyswitch_lwe_list_to_glwe_64(
|
||||
void *stream, uint32_t gpu_index, void *glwe_array_out, void *lwe_array_in,
|
||||
void *fp_ksk_array, int8_t *fp_ks_buffer, uint32_t input_lwe_dimension,
|
||||
uint32_t output_glwe_dimension, uint32_t output_polynomial_size,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_lwes);
|
||||
|
||||
void cleanup_packing_keyswitch_lwe_list_to_glwe(void *stream,
|
||||
uint32_t gpu_index,
|
||||
int8_t **fp_ks_buffer);
|
||||
}
|
||||
|
||||
#endif // CNCRT_KS_H_
|
||||
|
||||
@@ -26,14 +26,12 @@ void cuda_convert_lwe_programmable_bootstrap_key_64(
|
||||
void scratch_cuda_programmable_bootstrap_amortized_32(
|
||||
void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory);
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
|
||||
|
||||
void scratch_cuda_programmable_bootstrap_amortized_64(
|
||||
void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory);
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
|
||||
|
||||
void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32(
|
||||
void *stream, uint32_t gpu_index, void *lwe_array_out,
|
||||
@@ -41,8 +39,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32(
|
||||
void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
|
||||
int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
|
||||
uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
|
||||
uint32_t max_shared_memory);
|
||||
uint32_t num_samples);
|
||||
|
||||
void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
|
||||
void *stream, uint32_t gpu_index, void *lwe_array_out,
|
||||
@@ -50,8 +47,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
|
||||
void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
|
||||
int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
|
||||
uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
|
||||
uint32_t max_shared_memory, uint32_t gpu_offset = 0);
|
||||
uint32_t num_samples);
|
||||
|
||||
void cleanup_cuda_programmable_bootstrap_amortized(void *stream,
|
||||
uint32_t gpu_index,
|
||||
@@ -60,14 +56,12 @@ void cleanup_cuda_programmable_bootstrap_amortized(void *stream,
|
||||
void scratch_cuda_programmable_bootstrap_32(
|
||||
void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory);
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
|
||||
|
||||
void scratch_cuda_programmable_bootstrap_64(
|
||||
void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory);
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
|
||||
|
||||
void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
|
||||
void *stream, uint32_t gpu_index, void *lwe_array_out,
|
||||
@@ -75,8 +69,8 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
|
||||
void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
|
||||
int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
|
||||
uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
|
||||
uint32_t max_shared_memory, uint32_t gpu_offset = 0);
|
||||
uint32_t num_samples, uint32_t lut_count, uint32_t lut_stride,
|
||||
bool do_modulus_switch);
|
||||
|
||||
void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
|
||||
void *stream, uint32_t gpu_index, void *lwe_array_out,
|
||||
@@ -84,44 +78,34 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
|
||||
void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
|
||||
int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
|
||||
uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
|
||||
uint32_t max_shared_memory, uint32_t gpu_offset = 0);
|
||||
uint32_t num_samples, uint32_t lut_count, uint32_t lut_stride,
|
||||
bool do_modulus_switch);
|
||||
|
||||
void cleanup_cuda_programmable_bootstrap(void *stream, uint32_t gpu_index,
|
||||
int8_t **pbs_buffer);
|
||||
|
||||
uint64_t get_buffer_size_programmable_bootstrap_amortized_64(
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory);
|
||||
|
||||
uint64_t get_buffer_size_programmable_bootstrap_64(
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ __device__ uint64_t
|
||||
get_buffer_size_full_sm_programmable_bootstrap_step_one(
|
||||
uint64_t get_buffer_size_full_sm_programmable_bootstrap_step_one(
|
||||
uint32_t polynomial_size) {
|
||||
return sizeof(Torus) * polynomial_size + // accumulator_rotated
|
||||
sizeof(double2) * polynomial_size / 2; // accumulator fft
|
||||
}
|
||||
template <typename Torus>
|
||||
__host__ __device__ uint64_t
|
||||
get_buffer_size_full_sm_programmable_bootstrap_step_two(
|
||||
uint64_t get_buffer_size_full_sm_programmable_bootstrap_step_two(
|
||||
uint32_t polynomial_size) {
|
||||
return sizeof(Torus) * polynomial_size + // accumulator
|
||||
sizeof(double2) * polynomial_size / 2; // accumulator fft
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ __device__ uint64_t
|
||||
uint64_t
|
||||
get_buffer_size_partial_sm_programmable_bootstrap(uint32_t polynomial_size) {
|
||||
return sizeof(double2) * polynomial_size / 2; // accumulator fft
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ __device__ uint64_t
|
||||
uint64_t
|
||||
get_buffer_size_full_sm_programmable_bootstrap_tbc(uint32_t polynomial_size) {
|
||||
return sizeof(Torus) * polynomial_size + // accumulator_rotated
|
||||
sizeof(Torus) * polynomial_size + // accumulator
|
||||
@@ -129,21 +113,19 @@ get_buffer_size_full_sm_programmable_bootstrap_tbc(uint32_t polynomial_size) {
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ __device__ uint64_t
|
||||
get_buffer_size_partial_sm_programmable_bootstrap_tbc(
|
||||
uint64_t get_buffer_size_partial_sm_programmable_bootstrap_tbc(
|
||||
uint32_t polynomial_size) {
|
||||
return sizeof(double2) * polynomial_size / 2; // accumulator fft mask & body
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ __device__ uint64_t
|
||||
get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap(
|
||||
uint64_t get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap(
|
||||
uint32_t polynomial_size) {
|
||||
return sizeof(double2) * polynomial_size / 2; // tbc
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ __device__ uint64_t
|
||||
uint64_t
|
||||
get_buffer_size_full_sm_programmable_bootstrap_cg(uint32_t polynomial_size) {
|
||||
return sizeof(Torus) * polynomial_size + // accumulator_rotated
|
||||
sizeof(Torus) * polynomial_size + // accumulator
|
||||
@@ -151,15 +133,14 @@ get_buffer_size_full_sm_programmable_bootstrap_cg(uint32_t polynomial_size) {
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ __device__ uint64_t
|
||||
uint64_t
|
||||
get_buffer_size_partial_sm_programmable_bootstrap_cg(uint32_t polynomial_size) {
|
||||
return sizeof(double2) * polynomial_size / 2; // accumulator fft mask & body
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ bool
|
||||
supports_distributed_shared_memory_on_classic_programmable_bootstrap(
|
||||
uint32_t polynomial_size, uint32_t max_shared_memory);
|
||||
bool supports_distributed_shared_memory_on_classic_programmable_bootstrap(
|
||||
uint32_t polynomial_size);
|
||||
|
||||
template <typename Torus, PBS_TYPE pbs_type> struct pbs_buffer;
|
||||
|
||||
@@ -178,7 +159,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
|
||||
|
||||
this->pbs_variant = pbs_variant;
|
||||
|
||||
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
|
||||
auto max_shared_memory = cuda_get_max_shared_memory(0);
|
||||
|
||||
if (allocate_gpu_memory) {
|
||||
switch (pbs_variant) {
|
||||
@@ -255,7 +236,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
|
||||
|
||||
bool supports_dsm =
|
||||
supports_distributed_shared_memory_on_classic_programmable_bootstrap<
|
||||
Torus>(polynomial_size, max_shared_memory);
|
||||
Torus>(polynomial_size);
|
||||
|
||||
uint64_t full_sm =
|
||||
get_buffer_size_full_sm_programmable_bootstrap_tbc<Torus>(
|
||||
@@ -314,10 +295,10 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
|
||||
};
|
||||
|
||||
template <typename Torus>
|
||||
__host__ __device__ uint64_t get_buffer_size_programmable_bootstrap_cg(
|
||||
uint64_t get_buffer_size_programmable_bootstrap_cg(
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {
|
||||
|
||||
uint32_t input_lwe_ciphertext_count) {
|
||||
int max_shared_memory = cuda_get_max_shared_memory(0);
|
||||
uint64_t full_sm =
|
||||
get_buffer_size_full_sm_programmable_bootstrap_cg<Torus>(polynomial_size);
|
||||
uint64_t partial_sm =
|
||||
@@ -343,8 +324,7 @@ template <typename Torus>
|
||||
bool has_support_to_cuda_programmable_bootstrap_cg(uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size,
|
||||
uint32_t level_count,
|
||||
uint32_t num_samples,
|
||||
uint32_t max_shared_memory);
|
||||
uint32_t num_samples);
|
||||
|
||||
template <typename Torus>
|
||||
void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
|
||||
@@ -353,8 +333,8 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
|
||||
Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
|
||||
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
|
||||
uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t gpu_offset = 0);
|
||||
uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
|
||||
uint32_t lut_stride, bool do_modulus_switch);
|
||||
|
||||
template <typename Torus>
|
||||
void cuda_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
@@ -363,8 +343,8 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
|
||||
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
|
||||
uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t gpu_offset = 0);
|
||||
uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
|
||||
uint32_t lut_stride, bool do_modulus_switch);
|
||||
|
||||
#if (CUDA_ARCH >= 900)
|
||||
template <typename Torus>
|
||||
@@ -374,43 +354,45 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
|
||||
Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
|
||||
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
|
||||
uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t gpu_offset = 0);
|
||||
uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
|
||||
uint32_t lut_stride, bool do_modulus_switch);
|
||||
|
||||
template <typename Torus, typename STorus>
|
||||
template <typename Torus>
|
||||
void scratch_cuda_programmable_bootstrap_tbc(
|
||||
void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **pbs_buffer,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory);
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
|
||||
#endif
|
||||
|
||||
template <typename Torus, typename STorus>
|
||||
template <typename Torus>
|
||||
void scratch_cuda_programmable_bootstrap_cg(
|
||||
void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **pbs_buffer,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory);
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
|
||||
|
||||
template <typename Torus, typename STorus>
|
||||
template <typename Torus>
|
||||
void scratch_cuda_programmable_bootstrap(
|
||||
void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **buffer,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory);
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
|
||||
|
||||
template <typename Torus>
|
||||
bool has_support_to_cuda_programmable_bootstrap_tbc(uint32_t num_samples,
|
||||
uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size,
|
||||
uint32_t level_count,
|
||||
uint32_t max_shared_memory);
|
||||
uint32_t level_count);
|
||||
|
||||
#ifdef __CUDACC__
|
||||
__device__ inline int get_start_ith_ggsw(int i, uint32_t polynomial_size,
|
||||
int glwe_dimension,
|
||||
uint32_t level_count);
|
||||
|
||||
template <typename T>
|
||||
__device__ const T *get_ith_mask_kth_block(const T *ptr, int i, int k,
|
||||
int level, uint32_t polynomial_size,
|
||||
int glwe_dimension,
|
||||
uint32_t level_count);
|
||||
|
||||
template <typename T>
|
||||
__device__ T *get_ith_mask_kth_block(T *ptr, int i, int k, int level,
|
||||
uint32_t polynomial_size,
|
||||
@@ -422,8 +404,8 @@ __device__ T *get_ith_body_kth_block(T *ptr, int i, int k, int level,
|
||||
int glwe_dimension, uint32_t level_count);
|
||||
|
||||
template <typename T>
|
||||
__device__ T *get_multi_bit_ith_lwe_gth_group_kth_block(
|
||||
T *ptr, int g, int i, int k, int level, uint32_t grouping_factor,
|
||||
__device__ const T *get_multi_bit_ith_lwe_gth_group_kth_block(
|
||||
const T *ptr, int g, int i, int k, int level, uint32_t grouping_factor,
|
||||
uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t level_count);
|
||||
|
||||
#endif
|
||||
|
||||
@@ -8,7 +8,7 @@ extern "C" {
|
||||
|
||||
bool has_support_to_cuda_programmable_bootstrap_cg_multi_bit(
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t num_samples, uint32_t max_shared_memory);
|
||||
uint32_t num_samples);
|
||||
|
||||
void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
|
||||
void *stream, uint32_t gpu_index, void *dest, void *src,
|
||||
@@ -17,10 +17,8 @@ void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
|
||||
|
||||
void scratch_cuda_multi_bit_programmable_bootstrap_64(
|
||||
void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t grouping_factor,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory, uint32_t chunk_size = 0);
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
|
||||
|
||||
void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
|
||||
void *stream, uint32_t gpu_index, void *lwe_array_out,
|
||||
@@ -28,9 +26,8 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
|
||||
void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
|
||||
int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
|
||||
uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t gpu_offset,
|
||||
uint32_t lwe_chunk_size = 0);
|
||||
uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
|
||||
uint32_t lut_stride);
|
||||
|
||||
void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream,
|
||||
uint32_t gpu_index,
|
||||
@@ -38,23 +35,20 @@ void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream,
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ bool
|
||||
supports_distributed_shared_memory_on_multibit_programmable_bootstrap(
|
||||
uint32_t polynomial_size, uint32_t max_shared_memory);
|
||||
bool supports_distributed_shared_memory_on_multibit_programmable_bootstrap(
|
||||
uint32_t polynomial_size);
|
||||
|
||||
template <typename Torus>
|
||||
bool has_support_to_cuda_programmable_bootstrap_tbc_multi_bit(
|
||||
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t max_shared_memory);
|
||||
uint32_t level_count);
|
||||
|
||||
#if CUDA_ARCH >= 900
|
||||
template <typename Torus, typename STorus>
|
||||
template <typename Torus>
|
||||
void scratch_cuda_tbc_multi_bit_programmable_bootstrap(
|
||||
void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **buffer,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t grouping_factor,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory, uint32_t lwe_chunk_size);
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
|
||||
|
||||
template <typename Torus>
|
||||
void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
@@ -64,24 +58,14 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
|
||||
uint32_t gpu_offset, uint32_t lwe_chunk_size);
|
||||
uint32_t lut_count, uint32_t lut_stride);
|
||||
#endif
|
||||
|
||||
template <typename Torus, typename STorus>
|
||||
void scratch_cuda_cg_multi_bit_programmable_bootstrap(
|
||||
void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **pbs_buffer,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t grouping_factor,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0);
|
||||
|
||||
template <typename Torus, typename STorus>
|
||||
template <typename Torus>
|
||||
void scratch_cuda_cg_multi_bit_programmable_bootstrap(
|
||||
void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **pbs_buffer,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0);
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
|
||||
|
||||
template <typename Torus>
|
||||
void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
@@ -91,16 +75,13 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
|
||||
uint32_t gpu_offset, uint32_t lwe_chunk_size = 0);
|
||||
uint32_t lut_count, uint32_t lut_stride);
|
||||
|
||||
template <typename Torus, typename STorus>
|
||||
template <typename Torus>
|
||||
void scratch_cuda_multi_bit_programmable_bootstrap(
|
||||
void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **pbs_buffer,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t grouping_factor,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0);
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
|
||||
|
||||
template <typename Torus>
|
||||
void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
@@ -110,44 +91,34 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
|
||||
uint32_t gpu_offset, uint32_t lwe_chunk_size = 0);
|
||||
uint32_t lut_count, uint32_t lut_stride);
|
||||
|
||||
template <typename Torus>
|
||||
__host__ __device__ uint64_t
|
||||
get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle(
|
||||
uint64_t get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle(
|
||||
uint32_t polynomial_size);
|
||||
template <typename Torus>
|
||||
__host__ __device__ uint64_t
|
||||
get_buffer_size_full_sm_multibit_programmable_bootstrap_step_one(
|
||||
uint64_t get_buffer_size_full_sm_multibit_programmable_bootstrap_step_one(
|
||||
uint32_t polynomial_size);
|
||||
template <typename Torus>
|
||||
__host__ __device__ uint64_t
|
||||
get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two(
|
||||
uint64_t get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two(
|
||||
uint32_t polynomial_size);
|
||||
template <typename Torus>
|
||||
__host__ __device__ uint64_t
|
||||
get_buffer_size_partial_sm_multibit_programmable_bootstrap_step_one(
|
||||
uint64_t get_buffer_size_partial_sm_multibit_programmable_bootstrap_step_one(
|
||||
uint32_t polynomial_size);
|
||||
template <typename Torus>
|
||||
__host__ __device__ uint64_t
|
||||
get_buffer_size_full_sm_cg_multibit_programmable_bootstrap(
|
||||
uint64_t get_buffer_size_full_sm_cg_multibit_programmable_bootstrap(
|
||||
uint32_t polynomial_size);
|
||||
template <typename Torus>
|
||||
__host__ __device__ uint64_t
|
||||
get_buffer_size_partial_sm_cg_multibit_programmable_bootstrap(
|
||||
uint64_t get_buffer_size_partial_sm_cg_multibit_programmable_bootstrap(
|
||||
uint32_t polynomial_size);
|
||||
template <typename Torus>
|
||||
__host__ __device__ uint64_t
|
||||
get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap(
|
||||
uint64_t get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap(
|
||||
uint32_t polynomial_size);
|
||||
template <typename Torus>
|
||||
__host__ __device__ uint64_t
|
||||
get_buffer_size_partial_sm_tbc_multibit_programmable_bootstrap(
|
||||
uint64_t get_buffer_size_partial_sm_tbc_multibit_programmable_bootstrap(
|
||||
uint32_t polynomial_size);
|
||||
template <typename Torus>
|
||||
__host__ __device__ uint64_t
|
||||
get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap(
|
||||
uint64_t get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap(
|
||||
uint32_t polynomial_size);
|
||||
|
||||
template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
|
||||
@@ -156,7 +127,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
|
||||
int8_t *d_mem_acc_step_two = NULL;
|
||||
int8_t *d_mem_acc_cg = NULL;
|
||||
int8_t *d_mem_acc_tbc = NULL;
|
||||
|
||||
uint32_t lwe_chunk_size;
|
||||
double2 *keybundle_fft;
|
||||
Torus *global_accumulator;
|
||||
double2 *global_accumulator_fft;
|
||||
@@ -168,6 +139,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t lwe_chunk_size,
|
||||
PBS_VARIANT pbs_variant, bool allocate_gpu_memory) {
|
||||
this->pbs_variant = pbs_variant;
|
||||
this->lwe_chunk_size = lwe_chunk_size;
|
||||
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
|
||||
|
||||
// default
|
||||
@@ -317,8 +289,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
|
||||
};
|
||||
|
||||
template <typename Torus, class params>
|
||||
__host__ uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
|
||||
uint32_t polynomial_size,
|
||||
uint32_t max_shared_memory);
|
||||
uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
|
||||
uint32_t polynomial_size);
|
||||
|
||||
#endif // CUDA_MULTI_BIT_H
|
||||
|
||||
@@ -1,17 +1,3 @@
|
||||
set(SOURCES
|
||||
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bit_extraction.h
|
||||
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bitwise_ops.h
|
||||
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bootstrap.h
|
||||
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bootstrap_multibit.h
|
||||
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/ciphertext.h
|
||||
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/circuit_bootstrap.h
|
||||
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/device.h
|
||||
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/integer.h
|
||||
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/keyswitch.h
|
||||
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/linear_algebra.h
|
||||
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/shifts.h
|
||||
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/vertical_packing.h
|
||||
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/helper_multi_gpu.h)
|
||||
file(GLOB_RECURSE SOURCES "*.cu")
|
||||
add_library(tfhe_cuda_backend STATIC ${SOURCES})
|
||||
set_target_properties(tfhe_cuda_backend PROPERTIES CUDA_SEPARABLE_COMPILATION ON CUDA_RESOLVE_DEVICE_SYMBOLS ON)
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
#include "ciphertext.cuh"
|
||||
#include "polynomial/parameters.cuh"
|
||||
|
||||
void cuda_convert_lwe_ciphertext_vector_to_gpu_64(void *stream,
|
||||
uint32_t gpu_index,
|
||||
@@ -19,3 +20,58 @@ void cuda_convert_lwe_ciphertext_vector_to_cpu_64(void *stream,
|
||||
static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)dest,
|
||||
(uint64_t *)src, number_of_cts, lwe_dimension);
|
||||
}
|
||||
|
||||
void cuda_glwe_sample_extract_64(void *stream, uint32_t gpu_index,
|
||||
void *lwe_array_out, void *glwe_array_in,
|
||||
uint32_t *nth_array, uint32_t num_nths,
|
||||
uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size) {
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
host_sample_extract<uint64_t, AmortizedDegree<256>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
|
||||
(uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_nths,
|
||||
glwe_dimension);
|
||||
break;
|
||||
case 512:
|
||||
host_sample_extract<uint64_t, AmortizedDegree<512>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
|
||||
(uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_nths,
|
||||
glwe_dimension);
|
||||
break;
|
||||
case 1024:
|
||||
host_sample_extract<uint64_t, AmortizedDegree<1024>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
|
||||
(uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_nths,
|
||||
glwe_dimension);
|
||||
break;
|
||||
case 2048:
|
||||
host_sample_extract<uint64_t, AmortizedDegree<2048>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
|
||||
(uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_nths,
|
||||
glwe_dimension);
|
||||
break;
|
||||
case 4096:
|
||||
host_sample_extract<uint64_t, AmortizedDegree<4096>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
|
||||
(uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_nths,
|
||||
glwe_dimension);
|
||||
break;
|
||||
case 8192:
|
||||
host_sample_extract<uint64_t, AmortizedDegree<8192>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
|
||||
(uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_nths,
|
||||
glwe_dimension);
|
||||
break;
|
||||
case 16384:
|
||||
host_sample_extract<uint64_t, AmortizedDegree<16384>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
|
||||
(uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_nths,
|
||||
glwe_dimension);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error: unsupported polynomial size. Supported "
|
||||
"N's are powers of two in the interval [256..16384].")
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
|
||||
#include "ciphertext.h"
|
||||
#include "device.h"
|
||||
#include "polynomial/functions.cuh"
|
||||
#include <cstdint>
|
||||
|
||||
template <typename T>
|
||||
@@ -25,4 +26,40 @@ void cuda_convert_lwe_ciphertext_vector_to_cpu(cudaStream_t stream,
|
||||
cuda_memcpy_async_to_cpu(dest, src, size, stream, gpu_index);
|
||||
}
|
||||
|
||||
template <typename Torus, class params>
|
||||
__global__ void sample_extract(Torus *lwe_array_out, Torus *glwe_array_in,
|
||||
uint32_t *nth_array, uint32_t glwe_dimension) {
|
||||
|
||||
const int input_id = blockIdx.x;
|
||||
|
||||
const int glwe_input_size = (glwe_dimension + 1) * params::degree;
|
||||
const int lwe_output_size = glwe_dimension * params::degree + 1;
|
||||
|
||||
auto lwe_out = lwe_array_out + input_id * lwe_output_size;
|
||||
|
||||
// We assume each GLWE will store the first polynomial_size inputs
|
||||
uint32_t lwe_per_glwe = params::degree;
|
||||
auto glwe_in = glwe_array_in + (input_id / lwe_per_glwe) * glwe_input_size;
|
||||
|
||||
// nth is ensured to be in [0, lwe_per_glwe)
|
||||
auto nth = nth_array[input_id] % lwe_per_glwe;
|
||||
|
||||
sample_extract_mask<Torus, params>(lwe_out, glwe_in, glwe_dimension, nth);
|
||||
sample_extract_body<Torus, params>(lwe_out, glwe_in, glwe_dimension, nth);
|
||||
}
|
||||
|
||||
template <typename Torus, class params>
|
||||
__host__ void host_sample_extract(cudaStream_t stream, uint32_t gpu_index,
|
||||
Torus *lwe_array_out, Torus *glwe_array_in,
|
||||
uint32_t *nth_array, uint32_t num_nths,
|
||||
uint32_t glwe_dimension) {
|
||||
cudaSetDevice(gpu_index);
|
||||
|
||||
dim3 grid(num_nths);
|
||||
dim3 thds(params::degree / params::opt);
|
||||
sample_extract<Torus, params><<<grid, thds, 0, stream>>>(
|
||||
lwe_array_out, glwe_array_in, nth_array, glwe_dimension);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
@@ -9,16 +9,14 @@ void cuda_keyswitch_lwe_ciphertext_vector_32(
|
||||
void *stream, uint32_t gpu_index, void *lwe_array_out,
|
||||
void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
|
||||
void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t gpu_offset) {
|
||||
cuda_keyswitch_lwe_ciphertext_vector(
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
|
||||
host_keyswitch_lwe_ciphertext_vector<uint32_t>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index,
|
||||
static_cast<uint32_t *>(lwe_array_out),
|
||||
static_cast<uint32_t *>(lwe_output_indexes),
|
||||
static_cast<uint32_t *>(lwe_array_in),
|
||||
static_cast<uint32_t *>(lwe_input_indexes), static_cast<uint32_t *>(ksk),
|
||||
lwe_dimension_in, lwe_dimension_out, base_log, level_count, num_samples,
|
||||
gpu_offset);
|
||||
lwe_dimension_in, lwe_dimension_out, base_log, level_count, num_samples);
|
||||
}
|
||||
|
||||
/* Perform keyswitch on a batch of 64 bits input LWE ciphertexts.
|
||||
@@ -41,14 +39,44 @@ void cuda_keyswitch_lwe_ciphertext_vector_64(
|
||||
void *stream, uint32_t gpu_index, void *lwe_array_out,
|
||||
void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
|
||||
void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t gpu_offset) {
|
||||
cuda_keyswitch_lwe_ciphertext_vector(
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
|
||||
host_keyswitch_lwe_ciphertext_vector<uint64_t>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index,
|
||||
static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<uint64_t *>(lwe_output_indexes),
|
||||
static_cast<uint64_t *>(lwe_array_in),
|
||||
static_cast<uint64_t *>(lwe_input_indexes), static_cast<uint64_t *>(ksk),
|
||||
lwe_dimension_in, lwe_dimension_out, base_log, level_count, num_samples,
|
||||
gpu_offset);
|
||||
lwe_dimension_in, lwe_dimension_out, base_log, level_count, num_samples);
|
||||
}
|
||||
|
||||
/// Type-erased entry point (64-bit torus) that prepares the scratch buffer of
/// the packing keyswitch.
///
/// `stream` is an opaque handle that is cast to `cudaStream_t`; every other
/// argument is forwarded unchanged to
/// `scratch_packing_keyswitch_lwe_list_to_glwe<uint64_t>`.
void scratch_packing_keyswitch_lwe_list_to_glwe_64(
    void *stream, uint32_t gpu_index, int8_t **fp_ks_buffer,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t num_lwes,
    bool allocate_gpu_memory) {
  auto cuda_stream = static_cast<cudaStream_t>(stream);
  scratch_packing_keyswitch_lwe_list_to_glwe<uint64_t>(
      cuda_stream, gpu_index, fp_ks_buffer, glwe_dimension, polynomial_size,
      num_lwes, allocate_gpu_memory);
}
|
||||
/* Perform functional packing keyswitch on a batch of 64-bit input LWE
 * ciphertexts.
 *
 * Type-erased entry point: `stream` is cast to `cudaStream_t`, and the
 * ciphertext and key pointers are cast to `uint64_t *`. Everything is then
 * forwarded to `host_packing_keyswitch_lwe_list_to_glwe<uint64_t>`;
 * `fp_ks_buffer` is passed through untouched.
 */
void cuda_packing_keyswitch_lwe_list_to_glwe_64(
    void *stream, uint32_t gpu_index, void *glwe_array_out, void *lwe_array_in,
    void *fp_ksk_array, int8_t *fp_ks_buffer, uint32_t input_lwe_dimension,
    uint32_t output_glwe_dimension, uint32_t output_polynomial_size,
    uint32_t base_log, uint32_t level_count, uint32_t num_lwes) {

  auto cuda_stream = static_cast<cudaStream_t>(stream);
  auto *glwe_out = static_cast<uint64_t *>(glwe_array_out);
  auto *lwe_in = static_cast<uint64_t *>(lwe_array_in);
  auto *fp_ksk = static_cast<uint64_t *>(fp_ksk_array);

  host_packing_keyswitch_lwe_list_to_glwe<uint64_t>(
      cuda_stream, gpu_index, glwe_out, lwe_in, fp_ksk, fp_ks_buffer,
      input_lwe_dimension, output_glwe_dimension, output_polynomial_size,
      base_log, level_count, num_lwes);
}
|
||||
|
||||
/// Releases the packing-keyswitch scratch buffer.
///
/// The buffer pointed to by `*fp_ks_buffer` is handed to `cuda_drop_async` on
/// `stream` (cast to `cudaStream_t`) for device `gpu_index`. `*fp_ks_buffer`
/// itself is not modified.
void cleanup_packing_keyswitch_lwe_list_to_glwe(void *stream,
                                                uint32_t gpu_index,
                                                int8_t **fp_ks_buffer) {
  auto cuda_stream = static_cast<cudaStream_t>(stream);
  int8_t *scratch = *fp_ks_buffer;
  cuda_drop_async(scratch, cuda_stream, gpu_index);
}
|
||||
|
||||
@@ -7,6 +7,7 @@
|
||||
#include "polynomial/functions.cuh"
|
||||
#include "polynomial/polynomial_math.cuh"
|
||||
#include "torus.cuh"
|
||||
#include "utils/helper.cuh"
|
||||
#include "utils/kernel_dimensions.cuh"
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
@@ -38,26 +39,25 @@ __device__ Torus *get_ith_block(Torus *ksk, int i, int level,
|
||||
// threads in y are used to parallelize the lwe_dimension_in loop.
|
||||
// shared memory is used to store intermediate results of the reduction.
|
||||
template <typename Torus>
|
||||
__global__ void keyswitch(Torus *lwe_array_out, Torus *lwe_output_indexes,
|
||||
Torus *lwe_array_in, Torus *lwe_input_indexes,
|
||||
Torus *ksk, uint32_t lwe_dimension_in,
|
||||
uint32_t lwe_dimension_out, uint32_t base_log,
|
||||
uint32_t level_count, int gpu_offset) {
|
||||
__global__ void
|
||||
keyswitch(Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
|
||||
const Torus *__restrict__ lwe_array_in,
|
||||
const Torus *__restrict__ lwe_input_indexes,
|
||||
const Torus *__restrict__ ksk, uint32_t lwe_dimension_in,
|
||||
uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count) {
|
||||
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
const int shmem_index = threadIdx.x + threadIdx.y * blockDim.x;
|
||||
|
||||
extern __shared__ int8_t sharedmem[];
|
||||
Torus *lwe_acc_out = (Torus *)sharedmem;
|
||||
auto block_lwe_array_out =
|
||||
get_chunk(lwe_array_out, lwe_output_indexes[blockIdx.y + gpu_offset],
|
||||
lwe_dimension_out + 1);
|
||||
auto block_lwe_array_out = get_chunk(
|
||||
lwe_array_out, lwe_output_indexes[blockIdx.y], lwe_dimension_out + 1);
|
||||
|
||||
if (tid <= lwe_dimension_out) {
|
||||
|
||||
Torus local_lwe_out = 0;
|
||||
auto block_lwe_array_in =
|
||||
get_chunk(lwe_array_in, lwe_input_indexes[blockIdx.y + gpu_offset],
|
||||
lwe_dimension_in + 1);
|
||||
auto block_lwe_array_in = get_chunk(
|
||||
lwe_array_in, lwe_input_indexes[blockIdx.y], lwe_dimension_in + 1);
|
||||
|
||||
if (tid == lwe_dimension_out && threadIdx.y == 0) {
|
||||
local_lwe_out = block_lwe_array_in[lwe_dimension_in];
|
||||
@@ -99,12 +99,11 @@ __global__ void keyswitch(Torus *lwe_array_out, Torus *lwe_output_indexes,
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void cuda_keyswitch_lwe_ciphertext_vector(
|
||||
__host__ void host_keyswitch_lwe_ciphertext_vector(
|
||||
cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
|
||||
Torus *lwe_output_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes,
|
||||
Torus *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t gpu_offset = 0) {
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
|
||||
|
||||
cudaSetDevice(gpu_index);
|
||||
|
||||
@@ -120,42 +119,196 @@ __host__ void cuda_keyswitch_lwe_ciphertext_vector(
|
||||
|
||||
keyswitch<Torus><<<grid, threads, shared_mem, stream>>>(
|
||||
lwe_array_out, lwe_output_indexes, lwe_array_in, lwe_input_indexes, ksk,
|
||||
lwe_dimension_in, lwe_dimension_out, base_log, level_count, gpu_offset);
|
||||
lwe_dimension_in, lwe_dimension_out, base_log, level_count);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
void execute_keyswitch(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
uint32_t gpu_count, Torus *lwe_array_out,
|
||||
Torus *lwe_output_indexes, Torus *lwe_array_in,
|
||||
Torus *lwe_input_indexes, Torus **ksks,
|
||||
uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
|
||||
uint32_t base_log, uint32_t level_count,
|
||||
uint32_t num_samples, bool sync_streams = true) {
|
||||
void execute_keyswitch_async(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
uint32_t gpu_count,
|
||||
const LweArrayVariant<Torus> &lwe_array_out,
|
||||
const LweArrayVariant<Torus> &lwe_output_indexes,
|
||||
const LweArrayVariant<Torus> &lwe_array_in,
|
||||
const LweArrayVariant<Torus> &lwe_input_indexes,
|
||||
Torus **ksks, uint32_t lwe_dimension_in,
|
||||
uint32_t lwe_dimension_out, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t num_samples) {
|
||||
|
||||
/// If the number of radix blocks is lower than the number of GPUs, not all
|
||||
/// GPUs will be active and there will be 1 input per GPU
|
||||
auto active_gpu_count = get_active_gpu_count(num_samples, gpu_count);
|
||||
int num_samples_on_gpu_0 = get_num_inputs_on_gpu(num_samples, 0, gpu_count);
|
||||
if (sync_streams)
|
||||
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
|
||||
#pragma omp parallel for num_threads(active_gpu_count)
|
||||
for (uint i = 0; i < active_gpu_count; i++) {
|
||||
for (uint i = 0; i < gpu_count; i++) {
|
||||
int num_samples_on_gpu = get_num_inputs_on_gpu(num_samples, i, gpu_count);
|
||||
int gpu_offset = get_gpu_offset(num_samples, i, gpu_count);
|
||||
|
||||
Torus *current_lwe_array_out = GET_VARIANT_ELEMENT(lwe_array_out, i);
|
||||
Torus *current_lwe_output_indexes =
|
||||
GET_VARIANT_ELEMENT(lwe_output_indexes, i);
|
||||
Torus *current_lwe_array_in = GET_VARIANT_ELEMENT(lwe_array_in, i);
|
||||
Torus *current_lwe_input_indexes =
|
||||
GET_VARIANT_ELEMENT(lwe_input_indexes, i);
|
||||
|
||||
// Compute Keyswitch
|
||||
cuda_keyswitch_lwe_ciphertext_vector<Torus>(
|
||||
streams[i], gpu_indexes[i], lwe_array_out, lwe_output_indexes,
|
||||
lwe_array_in, lwe_input_indexes, ksks[i], lwe_dimension_in,
|
||||
lwe_dimension_out, base_log, level_count, num_samples_on_gpu,
|
||||
gpu_offset);
|
||||
host_keyswitch_lwe_ciphertext_vector<Torus>(
|
||||
streams[i], gpu_indexes[i], current_lwe_array_out,
|
||||
current_lwe_output_indexes, current_lwe_array_in,
|
||||
current_lwe_input_indexes, ksks[i], lwe_dimension_in, lwe_dimension_out,
|
||||
base_log, level_count, num_samples_on_gpu);
|
||||
}
|
||||
}
|
||||
|
||||
if (sync_streams)
|
||||
for (uint i = 0; i < active_gpu_count; i++) {
|
||||
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
|
||||
/// Allocates the scratch buffer used by the packing keyswitch.
///
/// The buffer holds `2 * num_lwes` GLWE accumulators of
/// `(glwe_dimension + 1) * polynomial_size` `Torus` elements each. It is
/// allocated with `cuda_malloc_async` on `stream` and stored in
/// `*fp_ks_buffer`. When `allocate_gpu_memory` is false nothing is allocated
/// and `*fp_ks_buffer` is left untouched.
///
/// The byte count is computed entirely in `size_t`, so large parameter sets
/// cannot wrap a 32-bit intermediate before the multiplication by
/// `sizeof(Torus)`.
template <typename Torus>
__host__ void scratch_packing_keyswitch_lwe_list_to_glwe(
    cudaStream_t stream, uint32_t gpu_index, int8_t **fp_ks_buffer,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t num_lwes,
    bool allocate_gpu_memory) {
  check_cuda_error(cudaSetDevice(gpu_index));

  const size_t glwe_accumulator_size =
      (static_cast<size_t>(glwe_dimension) + 1) * polynomial_size;
  const size_t buffer_bytes =
      2 * static_cast<size_t>(num_lwes) * glwe_accumulator_size * sizeof(Torus);

  if (allocate_gpu_memory)
    *fp_ks_buffer =
        (int8_t *)cuda_malloc_async(buffer_bytes, stream, gpu_index);
}
|
||||
|
||||
// Public functional packing keyswitch for a single LWE ciphertext.
//
// Assumes there are (glwe_dimension+1) * polynomial_size threads split through
// different thread blocks at the x-axis to work on that input: the thread with
// global x-index `tid` computes coefficient `tid` of `glwe_out`. Threads whose
// `tid` is outside that range do nothing.
//
// For each input coefficient `lwe_in[i]` (i < lwe_dimension_in) the value is
// rounded with `round_to_closest_multiple`, decomposed into `level_count`
// digits with `decompose_one`, and each digit times the matching key
// coefficient is subtracted from the accumulator. The key is indexed as
// fp_ksk[(i * level_count + j) * glwe_size * polynomial_size + tid].
//
// The running value is kept in a register and written to `glwe_out[tid]` once
// at the end, rather than read-modify-written in global memory on every
// iteration.
template <typename Torus>
__device__ void packing_keyswitch_lwe_ciphertext_into_glwe_ciphertext(
    Torus *glwe_out, Torus *lwe_in, Torus *fp_ksk, uint32_t lwe_dimension_in,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
    uint32_t level_count) {

  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
  const size_t glwe_size = (glwe_dimension + 1);

  if (tid < glwe_size * polynomial_size) {
    // number of coefficients inside fp-ksk block for each lwe_input coefficient
    const size_t ksk_block_size = glwe_size * polynomial_size * level_count;

    // Seed the accumulator through SEL with 0 and the last element of lwe_in.
    // NOTE(review): presumably SEL(a, b, cond) yields b when cond holds, i.e.
    // only the coefficient at index glwe_dimension * polynomial_size starts
    // from lwe_in[lwe_dimension_in] and every other one from 0 - confirm
    // against the SEL definition.
    Torus acc = SEL(0, lwe_in[lwe_dimension_in],
                    tid == glwe_dimension * polynomial_size);

    // Iterate through all lwe elements
    for (uint32_t i = 0; i < lwe_dimension_in; i++) {
      // Round and prepare decomposition
      Torus a_i = round_to_closest_multiple(lwe_in[i], base_log, level_count);

      Torus state = a_i >> (sizeof(Torus) * 8 - base_log * level_count);
      Torus mod_b_mask = (1ll << base_log) - 1ll;

      // block of key for current lwe coefficient (lwe_in[i])
      auto ksk_block = &fp_ksk[i * ksk_block_size];
      for (uint32_t j = 0; j < level_count; j++) {
        // GLWE of the key for decomposition level j
        auto ksk_glwe = &ksk_block[j * glwe_size * polynomial_size];
        Torus decomposed = decompose_one<Torus>(state, mod_b_mask, base_log);
        // blockIdx.x * blockDim.x + threadIdx.x is exactly tid
        acc -= decomposed * ksk_glwe[tid];
      }
    }

    glwe_out[tid] = acc;
  }
}
|
||||
|
||||
// Public functional packing keyswitch for a batch of LWE ciphertexts.
//
// Selects the input each thread is working on using the y-block index.
//
// Assumes there are (glwe_dimension+1) * polynomial_size threads split through
// different thread blocks at the x-axis to work on that input. Threads of an
// over-provisioned grid (global x-index beyond that range) exit immediately so
// they never index past the accumulator of their input.
//
// For input `blockIdx.y` the kernel:
//   1. keyswitches the LWE into the GLWE accumulator stored in `d_mem`;
//   2. calls polynomial_accumulate_monic_monomial_mul to write that
//      accumulator, multiplied by the monomial x^input_id, into the matching
//      slot of `glwe_array_out`.
template <typename Torus>
__global__ void
packing_keyswitch_lwe_list_to_glwe(Torus *glwe_array_out, Torus *lwe_array_in,
                                   Torus *fp_ksk, uint32_t lwe_dimension_in,
                                   uint32_t glwe_dimension,
                                   uint32_t polynomial_size, uint32_t base_log,
                                   uint32_t level_count, Torus *d_mem) {
  const int tid = threadIdx.x + blockIdx.x * blockDim.x;

  const int glwe_accumulator_size = (glwe_dimension + 1) * polynomial_size;
  const int lwe_size = (lwe_dimension_in + 1);

  // Tail guard: without it the monomial multiplication below would read and
  // write outside this input's accumulator.
  // NOTE(review): assumes polynomial_accumulate_monic_monomial_mul contains no
  // block-wide barrier - confirm.
  if (tid >= glwe_accumulator_size)
    return;

  const int input_id = blockIdx.y;
  const int degree = input_id;

  // Select an input
  auto lwe_in = lwe_array_in + input_id * lwe_size;
  auto ks_glwe_out = d_mem + input_id * glwe_accumulator_size;
  auto glwe_out = glwe_array_out + input_id * glwe_accumulator_size;

  // KS LWE to GLWE
  packing_keyswitch_lwe_ciphertext_into_glwe_ciphertext(
      ks_glwe_out, lwe_in, fp_ksk, lwe_dimension_in, glwe_dimension,
      polynomial_size, base_log, level_count);

  // P * x ^degree, applied to the polynomial this thread belongs to
  const int poly_offset = (tid / polynomial_size) * polynomial_size;
  auto in_poly = ks_glwe_out + poly_offset;
  auto out_result = glwe_out + poly_offset;
  polynomial_accumulate_monic_monomial_mul(out_result, in_poly, degree,
                                           tid % polynomial_size,
                                           polynomial_size, 1, true);
}
|
||||
|
||||
/// Sums `num_lwes` consecutive GLWE ciphertexts coefficient-wise.
///
/// `glwe_array_in` holds the ciphertexts back to back, each made of
/// `(glwe_dimension + 1) * polynomial_size` coefficients. The thread with
/// global x-index `tid` adds coefficient `tid` of every ciphertext and writes
/// the total to `glwe_out[tid]`. Threads beyond the ciphertext size do
/// nothing.
///
/// The sum is kept in a register and stored once, so each thread issues a
/// single global write instead of one read-modify-write per addend.
/// The first ciphertext is always read, even when `num_lwes` is 0.
template <typename Torus>
__global__ void accumulate_glwes(Torus *glwe_out, Torus *glwe_array_in,
                                 uint32_t glwe_dimension,
                                 uint32_t polynomial_size, uint32_t num_lwes) {
  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
  const uint32_t glwe_accumulator_size = (glwe_dimension + 1) * polynomial_size;

  if (tid < glwe_accumulator_size) {
    Torus sum = glwe_array_in[tid];

    // Accumulate the remaining ciphertexts; offsets are computed in size_t so
    // the product cannot wrap a 32-bit intermediate.
    for (uint32_t i = 1; i < num_lwes; i++) {
      sum += glwe_array_in[static_cast<size_t>(i) * glwe_accumulator_size + tid];
    }

    glwe_out[tid] = sum;
  }
}
|
||||
|
||||
/// Packs `num_lwes` LWE ciphertexts into a single GLWE ciphertext.
///
/// Two kernels are enqueued on `stream` (no synchronization is performed):
///   1. `packing_keyswitch_lwe_list_to_glwe`, on a grid of
///      (num_blocks, num_lwes) blocks, keyswitches every LWE individually;
///   2. `accumulate_glwes`, on `num_blocks` blocks, sums the per-input results
///      into `glwe_out`.
///
/// `fp_ks_buffer` is used as two consecutive regions of
/// `num_lwes * (glwe_dimension + 1) * polynomial_size` `Torus` elements: the
/// first is passed to the first kernel as `d_mem`, the second receives its
/// output and feeds the accumulation.
///
/// Panics when `num_lwes > polynomial_size`.
template <typename Torus>
__host__ void host_packing_keyswitch_lwe_list_to_glwe(
    cudaStream_t stream, uint32_t gpu_index, Torus *glwe_out,
    Torus *lwe_array_in, Torus *fp_ksk_array, int8_t *fp_ks_buffer,
    uint32_t lwe_dimension_in, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
    uint32_t num_lwes) {

  // num_lwes == polynomial_size is accepted, so the message says "or equal".
  if (num_lwes > polynomial_size) {
    PANIC("Cuda error: too many LWEs to pack. The number of LWEs should be "
          "smaller than or equal to "
          "polynomial_size.")
  }

  // Surface an invalid device index here instead of at the first launch.
  check_cuda_error(cudaSetDevice(gpu_index));
  int glwe_accumulator_size = (glwe_dimension + 1) * polynomial_size;

  int num_blocks = 0, num_threads = 0;
  getNumBlocksAndThreads(glwe_accumulator_size, 128, num_blocks, num_threads);

  dim3 grid(num_blocks, num_lwes);
  dim3 threads(num_threads);

  auto d_mem = (Torus *)fp_ks_buffer;
  auto d_tmp_glwe_array_out = d_mem + num_lwes * glwe_accumulator_size;

  // individually keyswitch each lwe
  packing_keyswitch_lwe_list_to_glwe<<<grid, threads, 0, stream>>>(
      d_tmp_glwe_array_out, lwe_array_in, fp_ksk_array, lwe_dimension_in,
      glwe_dimension, polynomial_size, base_log, level_count, d_mem);
  check_cuda_error(cudaGetLastError());

  // accumulate to a single glwe
  accumulate_glwes<<<num_blocks, threads, 0, stream>>>(
      glwe_out, d_tmp_glwe_array_out, glwe_dimension, polynomial_size,
      num_lwes);
  check_cuda_error(cudaGetLastError());
}
|
||||
|
||||
#endif
|
||||
|
||||
@@ -1,9 +1,16 @@
|
||||
#ifndef CNCRT_TORUS_CUH
|
||||
#define CNCRT_TORUS_CUH
|
||||
|
||||
#include "polynomial/parameters.cuh"
|
||||
#include "types/int128.cuh"
|
||||
#include "utils/kernel_dimensions.cuh"
|
||||
#include <limits>
|
||||
|
||||
/// Returns two to the power of the torus bit width as a double: 2^32 when `T`
/// is 4 bytes wide, 2^64 for every other width.
template <typename T>
__host__ __device__ __forceinline__ constexpr double get_two_pow_torus_bits() {
  if (sizeof(T) == 4)
    return 4294967296.0;
  return 18446744073709551616.0;
}
|
||||
|
||||
template <typename T>
|
||||
__device__ inline void typecast_double_to_torus(double x, T &r) {
|
||||
r = T(x);
|
||||
@@ -26,49 +33,63 @@ __device__ inline void typecast_double_to_torus<uint64_t>(double x,
|
||||
r = lll;
|
||||
}
|
||||
|
||||
/// Converts the fractional part of `x` to a torus element and stores it in `r`.
///
/// The fractional part is scaled by 2^(bit width of T), rounded, and handed to
/// `typecast_double_to_torus`.
template <typename T>
__device__ inline void typecast_double_round_to_torus(double x, T &r) {
  constexpr double torus_modulus = get_two_pow_torus_bits<T>();
  // The fractional part is extracted with floor() rather than round():
  // round() rounds halfway cases away from zero, which is an issue for .5.
  const double fractional_part = x - floor(x);
  const double scaled = fractional_part * torus_modulus;
  typecast_double_to_torus(round(scaled), r);
}
|
||||
|
||||
template <typename T>
|
||||
__device__ inline T round_to_closest_multiple(T x, uint32_t base_log,
|
||||
uint32_t level_count) {
|
||||
T shift = sizeof(T) * 8 - level_count * base_log;
|
||||
T mask = 1ll << (shift - 1);
|
||||
T b = (x & mask) >> (shift - 1);
|
||||
const T non_rep_bit_count = sizeof(T) * 8 - level_count * base_log;
|
||||
const T shift = non_rep_bit_count - 1;
|
||||
T res = x >> shift;
|
||||
res += b;
|
||||
res <<= shift;
|
||||
return res;
|
||||
res += 1;
|
||||
res &= (T)(-2);
|
||||
return res << shift;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ __forceinline__ void rescale_torus_element(T element, T &output,
|
||||
uint32_t log_shift) {
|
||||
output =
|
||||
round((double)element / (double(std::numeric_limits<T>::max()) + 1.0) *
|
||||
(double)log_shift);
|
||||
__device__ __forceinline__ void modulus_switch(T input, T &output,
|
||||
uint32_t log_modulus) {
|
||||
constexpr uint32_t BITS = sizeof(T) * 8;
|
||||
output = input + (((T)1) << (BITS - log_modulus - 1));
|
||||
output >>= (BITS - log_modulus);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ __forceinline__ T rescale_torus_element(T element,
|
||||
uint32_t log_shift) {
|
||||
return round((double)element / (double(std::numeric_limits<T>::max()) + 1.0) *
|
||||
(double)log_shift);
|
||||
__device__ __forceinline__ T modulus_switch(T input, uint32_t log_modulus) {
|
||||
T output;
|
||||
modulus_switch(input, output, log_modulus);
|
||||
return output;
|
||||
}
|
||||
|
||||
template <>
|
||||
__device__ __forceinline__ void
|
||||
rescale_torus_element<uint32_t>(uint32_t element, uint32_t &output,
|
||||
uint32_t log_shift) {
|
||||
output =
|
||||
round(__uint2double_rn(element) /
|
||||
(__uint2double_rn(std::numeric_limits<uint32_t>::max()) + 1.0) *
|
||||
__uint2double_rn(log_shift));
|
||||
template <typename Torus>
|
||||
__global__ void modulus_switch_inplace(Torus *array, int size,
|
||||
uint32_t log_modulus) {
|
||||
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
if (tid < size) {
|
||||
array[tid] = modulus_switch(array[tid], log_modulus);
|
||||
}
|
||||
}
|
||||
|
||||
template <>
|
||||
__device__ __forceinline__ void
|
||||
rescale_torus_element<uint64_t>(uint64_t element, uint64_t &output,
|
||||
uint32_t log_shift) {
|
||||
output = round(__ull2double_rn(element) /
|
||||
(__ull2double_rn(std::numeric_limits<uint64_t>::max()) + 1.0) *
|
||||
__uint2double_rn(log_shift));
|
||||
template <typename Torus>
|
||||
__host__ void host_modulus_switch_inplace(cudaStream_t stream,
|
||||
uint32_t gpu_index, Torus *array,
|
||||
int size, uint32_t log_modulus) {
|
||||
cudaSetDevice(gpu_index);
|
||||
|
||||
int num_threads = 0, num_blocks = 0;
|
||||
getNumBlocksAndThreads(size, 1024, num_blocks, num_threads);
|
||||
|
||||
modulus_switch_inplace<<<num_blocks, num_threads, 0, stream>>>(array, size,
|
||||
log_modulus);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
#endif // CNCRT_TORUS_H
|
||||
|
||||
@@ -137,6 +137,30 @@ void cuda_memcpy_async_gpu_to_gpu(void *dest, void *src, uint64_t size,
|
||||
}
|
||||
}
|
||||
|
||||
/// Copy `size` bytes between two device pointers, on the same GPU or across
/// two GPUs.
///
/// Does nothing when `size` is 0. Otherwise both pointers are validated with
/// `cudaPointerGetAttributes` and the call panics if either is not device
/// memory. `gpu_index` is then made the current device. The copy uses
/// `cudaMemcpy` (device-to-device) when both pointers report the same device
/// and `cudaMemcpyPeer` otherwise. No stream is involved.
void cuda_memcpy_gpu_to_gpu(void *dest, void *src, uint64_t size,
                            uint32_t gpu_index) {
  if (size == 0)
    return;

  cudaPointerAttributes dest_attributes;
  check_cuda_error(cudaPointerGetAttributes(&dest_attributes, dest));
  if (dest_attributes.type != cudaMemoryTypeDevice) {
    PANIC("Cuda error: invalid dest device pointer in copy from GPU to GPU.")
  }

  cudaPointerAttributes src_attributes;
  check_cuda_error(cudaPointerGetAttributes(&src_attributes, src));
  if (src_attributes.type != cudaMemoryTypeDevice) {
    PANIC("Cuda error: invalid src device pointer in copy from GPU to GPU.")
  }

  check_cuda_error(cudaSetDevice(gpu_index));

  const bool crosses_devices = src_attributes.device != dest_attributes.device;
  if (crosses_devices) {
    check_cuda_error(cudaMemcpyPeer(dest, dest_attributes.device, src,
                                    src_attributes.device, size));
  } else {
    check_cuda_error(cudaMemcpy(dest, src, size, cudaMemcpyDeviceToDevice));
  }
}
|
||||
|
||||
/// Synchronizes device
|
||||
void cuda_synchronize_device(uint32_t gpu_index) {
|
||||
check_cuda_error(cudaSetDevice(gpu_index));
|
||||
@@ -166,19 +190,21 @@ __global__ void cuda_set_value_kernel(Torus *array, Torus value, Torus n) {
|
||||
template <typename Torus>
|
||||
void cuda_set_value_async(cudaStream_t stream, uint32_t gpu_index,
|
||||
Torus *d_array, Torus value, Torus n) {
|
||||
cudaPointerAttributes attr;
|
||||
check_cuda_error(cudaPointerGetAttributes(&attr, d_array));
|
||||
if (attr.type != cudaMemoryTypeDevice) {
|
||||
PANIC("Cuda error: invalid dest device pointer in cuda set value.")
|
||||
}
|
||||
check_cuda_error(cudaSetDevice(gpu_index));
|
||||
int block_size = 256;
|
||||
int num_blocks = (n + block_size - 1) / block_size;
|
||||
if (n > 0) {
|
||||
cudaPointerAttributes attr;
|
||||
check_cuda_error(cudaPointerGetAttributes(&attr, d_array));
|
||||
if (attr.type != cudaMemoryTypeDevice) {
|
||||
PANIC("Cuda error: invalid dest device pointer in cuda set value.")
|
||||
}
|
||||
check_cuda_error(cudaSetDevice(gpu_index));
|
||||
int block_size = 256;
|
||||
int num_blocks = (n + block_size - 1) / block_size;
|
||||
|
||||
// Launch the kernel
|
||||
cuda_set_value_kernel<<<num_blocks, block_size, 0, stream>>>(d_array, value,
|
||||
n);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
// Launch the kernel
|
||||
cuda_set_value_kernel<Torus>
|
||||
<<<num_blocks, block_size, 0, stream>>>(d_array, value, n);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
}
|
||||
|
||||
/// Explicitly instantiate cuda_set_value_async for 32 and 64 bits
|
||||
@@ -241,7 +267,6 @@ void cuda_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index) {
|
||||
|
||||
/// Get the maximum size for the shared memory
|
||||
int cuda_get_max_shared_memory(uint32_t gpu_index) {
|
||||
check_cuda_error(cudaSetDevice(gpu_index));
|
||||
int max_shared_memory = 0;
|
||||
cudaDeviceGetAttribute(&max_shared_memory, cudaDevAttrMaxSharedMemoryPerBlock,
|
||||
gpu_index);
|
||||
|
||||
@@ -6,6 +6,7 @@
|
||||
#include "twiddles.cuh"
|
||||
#include "types/complex/operations.cuh"
|
||||
|
||||
using Index = unsigned;
|
||||
/*
|
||||
* Direct negacyclic FFT:
|
||||
* - before the FFT the N real coefficients are stored into a
|
||||
@@ -31,290 +32,81 @@ template <class params> __device__ void NSMFFT_direct(double2 *A) {
|
||||
* full loop, which should increase performance
|
||||
*/
|
||||
|
||||
size_t tid = threadIdx.x;
|
||||
size_t twid_id;
|
||||
size_t i1, i2;
|
||||
double2 u, v, w;
|
||||
__syncthreads();
|
||||
constexpr Index BUTTERFLY_DEPTH = params::opt >> 1;
|
||||
constexpr Index LOG2_DEGREE = params::log2_degree;
|
||||
constexpr Index HALF_DEGREE = params::degree >> 1;
|
||||
constexpr Index STRIDE = params::degree / params::opt;
|
||||
|
||||
Index tid = threadIdx.x;
|
||||
double2 u[BUTTERFLY_DEPTH], v[BUTTERFLY_DEPTH], w;
|
||||
|
||||
// load into registers
|
||||
#pragma unroll
|
||||
for (Index i = 0; i < BUTTERFLY_DEPTH; ++i) {
|
||||
u[i] = A[tid];
|
||||
v[i] = A[tid + HALF_DEGREE];
|
||||
|
||||
tid += STRIDE;
|
||||
}
|
||||
|
||||
// level 1
|
||||
// we don't make actual complex multiplication on level1 since we have only
|
||||
// one twiddle, its real and imaginary parts are equal, so we can multiply
|
||||
// it with simpler operations
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
i1 = tid;
|
||||
i2 = tid + params::degree / 2;
|
||||
|
||||
u = A[i1];
|
||||
v = A[i2] * (double2){0.707106781186547461715008466854,
|
||||
0.707106781186547461715008466854};
|
||||
|
||||
A[i1] += v;
|
||||
A[i2] = u - v;
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
for (Index i = 0; i < BUTTERFLY_DEPTH; ++i) {
|
||||
w = v[i] * (double2){0.707106781186547461715008466854,
|
||||
0.707106781186547461715008466854};
|
||||
v[i] = u[i] - w;
|
||||
u[i] = u[i] + w;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// level 2
|
||||
// from this level there are more than one twiddles and none of them has equal
|
||||
// real and imag parts, so complete complex multiplication is needed
|
||||
// for each level params::degree / 2^level represents number of coefficients
|
||||
// inside divided chunk of specific level
|
||||
//
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 4);
|
||||
i1 = 2 * (params::degree / 4) * twid_id + (tid & (params::degree / 4 - 1));
|
||||
i2 = i1 + params::degree / 4;
|
||||
Index twiddle_shift = 1;
|
||||
for (Index l = LOG2_DEGREE - 1; l >= 1; --l) {
|
||||
Index lane_mask = 1 << (l - 1);
|
||||
Index thread_mask = (1 << l) - 1;
|
||||
twiddle_shift <<= 1;
|
||||
|
||||
w = negtwiddles[twid_id + 2];
|
||||
u = A[i1];
|
||||
v = A[i2] * w;
|
||||
|
||||
A[i1] += v;
|
||||
A[i2] = u - v;
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// level 3
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 8);
|
||||
i1 = 2 * (params::degree / 8) * twid_id + (tid & (params::degree / 8 - 1));
|
||||
i2 = i1 + params::degree / 8;
|
||||
|
||||
w = negtwiddles[twid_id + 4];
|
||||
u = A[i1];
|
||||
v = A[i2] * w;
|
||||
|
||||
A[i1] += v;
|
||||
A[i2] = u - v;
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// level 4
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 16);
|
||||
i1 =
|
||||
2 * (params::degree / 16) * twid_id + (tid & (params::degree / 16 - 1));
|
||||
i2 = i1 + params::degree / 16;
|
||||
|
||||
w = negtwiddles[twid_id + 8];
|
||||
u = A[i1];
|
||||
v = A[i2] * w;
|
||||
|
||||
A[i1] += v;
|
||||
A[i2] = u - v;
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// level 5
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 32);
|
||||
i1 =
|
||||
2 * (params::degree / 32) * twid_id + (tid & (params::degree / 32 - 1));
|
||||
i2 = i1 + params::degree / 32;
|
||||
|
||||
w = negtwiddles[twid_id + 16];
|
||||
u = A[i1];
|
||||
v = A[i2] * w;
|
||||
|
||||
A[i1] += v;
|
||||
A[i2] = u - v;
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// level 6
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 64);
|
||||
i1 =
|
||||
2 * (params::degree / 64) * twid_id + (tid & (params::degree / 64 - 1));
|
||||
i2 = i1 + params::degree / 64;
|
||||
|
||||
w = negtwiddles[twid_id + 32];
|
||||
u = A[i1];
|
||||
v = A[i2] * w;
|
||||
|
||||
A[i1] += v;
|
||||
A[i2] = u - v;
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// level 7
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 128);
|
||||
i1 = 2 * (params::degree / 128) * twid_id +
|
||||
(tid & (params::degree / 128 - 1));
|
||||
i2 = i1 + params::degree / 128;
|
||||
|
||||
w = negtwiddles[twid_id + 64];
|
||||
u = A[i1];
|
||||
v = A[i2] * w;
|
||||
|
||||
A[i1] += v;
|
||||
A[i2] = u - v;
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// from level 8, we need to check size of params degree, because we support
|
||||
// minimum actual polynomial size = 256, when compressed size is halfed and
|
||||
// minimum supported compressed size is 128, so we always need first 7
|
||||
// levels of butterfly operation, since butterfly levels are hardcoded
|
||||
// we need to check if polynomial size is big enough to require specific level
|
||||
// of butterfly.
|
||||
if constexpr (params::degree >= 256) {
|
||||
// level 8
|
||||
tid = threadIdx.x;
|
||||
__syncthreads();
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 256);
|
||||
i1 = 2 * (params::degree / 256) * twid_id +
|
||||
(tid & (params::degree / 256 - 1));
|
||||
i2 = i1 + params::degree / 256;
|
||||
|
||||
w = negtwiddles[twid_id + 128];
|
||||
u = A[i1];
|
||||
v = A[i2] * w;
|
||||
|
||||
A[i1] += v;
|
||||
A[i2] = u - v;
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
for (Index i = 0; i < BUTTERFLY_DEPTH; i++) {
|
||||
Index rank = tid & thread_mask;
|
||||
bool u_stays_in_register = rank < lane_mask;
|
||||
A[tid] = (u_stays_in_register) ? v[i] : u[i];
|
||||
tid = tid + STRIDE;
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
if constexpr (params::degree >= 512) {
|
||||
// level 9
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 512);
|
||||
i1 = 2 * (params::degree / 512) * twid_id +
|
||||
(tid & (params::degree / 512 - 1));
|
||||
i2 = i1 + params::degree / 512;
|
||||
for (Index i = 0; i < BUTTERFLY_DEPTH; i++) {
|
||||
Index rank = tid & thread_mask;
|
||||
bool u_stays_in_register = rank < lane_mask;
|
||||
w = A[tid ^ lane_mask];
|
||||
u[i] = (u_stays_in_register) ? u[i] : w;
|
||||
v[i] = (u_stays_in_register) ? w : v[i];
|
||||
w = negtwiddles[tid / lane_mask + twiddle_shift];
|
||||
|
||||
w = negtwiddles[twid_id + 256];
|
||||
u = A[i1];
|
||||
v = A[i2] * w;
|
||||
w *= v[i];
|
||||
|
||||
A[i1] += v;
|
||||
A[i2] = u - v;
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
v[i] = u[i] - w;
|
||||
u[i] = u[i] + w;
|
||||
tid = tid + STRIDE;
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
if constexpr (params::degree >= 1024) {
|
||||
// level 10
|
||||
tid = threadIdx.x;
|
||||
// store registers in SM
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 1024);
|
||||
i1 = 2 * (params::degree / 1024) * twid_id +
|
||||
(tid & (params::degree / 1024 - 1));
|
||||
i2 = i1 + params::degree / 1024;
|
||||
|
||||
w = negtwiddles[twid_id + 512];
|
||||
u = A[i1];
|
||||
v = A[i2] * w;
|
||||
|
||||
A[i1] += v;
|
||||
A[i2] = u - v;
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
if constexpr (params::degree >= 2048) {
|
||||
// level 11
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 2048);
|
||||
i1 = 2 * (params::degree / 2048) * twid_id +
|
||||
(tid & (params::degree / 2048 - 1));
|
||||
i2 = i1 + params::degree / 2048;
|
||||
|
||||
w = negtwiddles[twid_id + 1024];
|
||||
u = A[i1];
|
||||
v = A[i2] * w;
|
||||
|
||||
A[i1] += v;
|
||||
A[i2] = u - v;
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
if constexpr (params::degree >= 4096) {
|
||||
// level 12
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 4096);
|
||||
i1 = 2 * (params::degree / 4096) * twid_id +
|
||||
(tid & (params::degree / 4096 - 1));
|
||||
i2 = i1 + params::degree / 4096;
|
||||
|
||||
w = negtwiddles[twid_id + 2048];
|
||||
u = A[i1];
|
||||
v = A[i2] * w;
|
||||
|
||||
A[i1] += v;
|
||||
A[i2] = u - v;
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
if constexpr (params::degree >= 8192) {
|
||||
// level 13
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 8192);
|
||||
i1 = 2 * (params::degree / 8192) * twid_id +
|
||||
(tid & (params::degree / 8192 - 1));
|
||||
i2 = i1 + params::degree / 8192;
|
||||
|
||||
w = negtwiddles[twid_id + 4096];
|
||||
u = A[i1];
|
||||
v = A[i2] * w;
|
||||
|
||||
A[i1] += v;
|
||||
A[i2] = u - v;
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
for (Index i = 0; i < BUTTERFLY_DEPTH; i++) {
|
||||
A[tid * 2] = u[i];
|
||||
A[tid * 2 + 1] = v[i];
|
||||
tid = tid + STRIDE;
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -329,284 +121,82 @@ template <class params> __device__ void NSMFFT_inverse(double2 *A) {
|
||||
* full loop, which should increase performance
|
||||
*/
|
||||
|
||||
__syncthreads();
|
||||
constexpr Index BUTTERFLY_DEPTH = params::opt >> 1;
|
||||
constexpr Index LOG2_DEGREE = params::log2_degree;
|
||||
constexpr Index DEGREE = params::degree;
|
||||
constexpr Index HALF_DEGREE = params::degree >> 1;
|
||||
constexpr Index STRIDE = params::degree / params::opt;
|
||||
|
||||
size_t tid = threadIdx.x;
|
||||
size_t twid_id;
|
||||
size_t i1, i2;
|
||||
double2 u, w;
|
||||
double2 u[BUTTERFLY_DEPTH], v[BUTTERFLY_DEPTH], w;
|
||||
|
||||
// divide input by compressed polynomial size
|
||||
tid = threadIdx.x;
|
||||
for (size_t i = 0; i < params::opt; ++i) {
|
||||
A[tid] /= params::degree;
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// none of the twiddles have equal real and imag part, so
|
||||
// complete complex multiplication has to be done
|
||||
// here we have more than one twiddle
|
||||
// mapping in backward fft is reversed
|
||||
// butterfly operation is started from last level
|
||||
|
||||
if constexpr (params::degree >= 8192) {
|
||||
// level 13
|
||||
tid = threadIdx.x;
|
||||
// load into registers and divide by compressed polynomial size
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 8192);
|
||||
i1 = 2 * (params::degree / 8192) * twid_id +
|
||||
(tid & (params::degree / 8192 - 1));
|
||||
i2 = i1 + params::degree / 8192;
|
||||
for (Index i = 0; i < BUTTERFLY_DEPTH; ++i) {
|
||||
u[i] = A[2 * tid];
|
||||
v[i] = A[2 * tid + 1];
|
||||
|
||||
w = negtwiddles[twid_id + 4096];
|
||||
u = A[i1] - A[i2];
|
||||
u[i] /= DEGREE;
|
||||
v[i] /= DEGREE;
|
||||
|
||||
A[i1] += A[i2];
|
||||
A[i2] = u * conjugate(w);
|
||||
tid += STRIDE;
|
||||
}
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
Index twiddle_shift = DEGREE;
|
||||
for (Index l = 1; l <= LOG2_DEGREE - 1; ++l) {
|
||||
Index lane_mask = 1 << (l - 1);
|
||||
Index thread_mask = (1 << l) - 1;
|
||||
tid = threadIdx.x;
|
||||
twiddle_shift >>= 1;
|
||||
|
||||
// at this point registers are ready for the butterfly
|
||||
tid = threadIdx.x;
|
||||
__syncthreads();
|
||||
#pragma unroll
|
||||
for (Index i = 0; i < BUTTERFLY_DEPTH; ++i) {
|
||||
w = (u[i] - v[i]);
|
||||
u[i] += v[i];
|
||||
v[i] = w * conjugate(negtwiddles[tid / lane_mask + twiddle_shift]);
|
||||
|
||||
// keep one of the registers for the next iteration and store the other one in sm
|
||||
Index rank = tid & thread_mask;
|
||||
bool u_stays_in_register = rank < lane_mask;
|
||||
A[tid] = (u_stays_in_register) ? v[i] : u[i];
|
||||
|
||||
tid = tid + STRIDE;
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
if constexpr (params::degree >= 4096) {
|
||||
// level 12
|
||||
// prepare registers for next butterfly iteration
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 4096);
|
||||
i1 = 2 * (params::degree / 4096) * twid_id +
|
||||
(tid & (params::degree / 4096 - 1));
|
||||
i2 = i1 + params::degree / 4096;
|
||||
for (Index i = 0; i < BUTTERFLY_DEPTH; ++i) {
|
||||
Index rank = tid & thread_mask;
|
||||
bool u_stays_in_register = rank < lane_mask;
|
||||
w = A[tid ^ lane_mask];
|
||||
u[i] = (u_stays_in_register) ? u[i] : w;
|
||||
v[i] = (u_stays_in_register) ? w : v[i];
|
||||
|
||||
w = negtwiddles[twid_id + 2048];
|
||||
u = A[i1] - A[i2];
|
||||
|
||||
A[i1] += A[i2];
|
||||
A[i2] = u * conjugate(w);
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
tid = tid + STRIDE;
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
if constexpr (params::degree >= 2048) {
|
||||
// level 11
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 2048);
|
||||
i1 = 2 * (params::degree / 2048) * twid_id +
|
||||
(tid & (params::degree / 2048 - 1));
|
||||
i2 = i1 + params::degree / 2048;
|
||||
|
||||
w = negtwiddles[twid_id + 1024];
|
||||
u = A[i1] - A[i2];
|
||||
|
||||
A[i1] += A[i2];
|
||||
A[i2] = u * conjugate(w);
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
if constexpr (params::degree >= 1024) {
|
||||
// level 10
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 1024);
|
||||
i1 = 2 * (params::degree / 1024) * twid_id +
|
||||
(tid & (params::degree / 1024 - 1));
|
||||
i2 = i1 + params::degree / 1024;
|
||||
|
||||
w = negtwiddles[twid_id + 512];
|
||||
u = A[i1] - A[i2];
|
||||
|
||||
A[i1] += A[i2];
|
||||
A[i2] = u * conjugate(w);
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
if constexpr (params::degree >= 512) {
|
||||
// level 9
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 512);
|
||||
i1 = 2 * (params::degree / 512) * twid_id +
|
||||
(tid & (params::degree / 512 - 1));
|
||||
i2 = i1 + params::degree / 512;
|
||||
|
||||
w = negtwiddles[twid_id + 256];
|
||||
u = A[i1] - A[i2];
|
||||
|
||||
A[i1] += A[i2];
|
||||
A[i2] = u * conjugate(w);
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
if constexpr (params::degree >= 256) {
|
||||
// level 8
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 256);
|
||||
i1 = 2 * (params::degree / 256) * twid_id +
|
||||
(tid & (params::degree / 256 - 1));
|
||||
i2 = i1 + params::degree / 256;
|
||||
|
||||
w = negtwiddles[twid_id + 128];
|
||||
u = A[i1] - A[i2];
|
||||
|
||||
A[i1] += A[i2];
|
||||
A[i2] = u * conjugate(w);
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
// below level 8, we don't need to check size of params degree, because we
|
||||
// support minimum actual polynomial size = 256, when compressed size is
|
||||
// halved and minimum supported compressed size is 128, so we always need
|
||||
// last 7 levels of butterfly operation, since butterfly levels are hardcoded
|
||||
// we don't need to check if polynomial size is big enough to require
|
||||
// specific level of butterfly.
|
||||
// level 7
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 128);
|
||||
i1 = 2 * (params::degree / 128) * twid_id +
|
||||
(tid & (params::degree / 128 - 1));
|
||||
i2 = i1 + params::degree / 128;
|
||||
|
||||
w = negtwiddles[twid_id + 64];
|
||||
u = A[i1] - A[i2];
|
||||
|
||||
A[i1] += A[i2];
|
||||
A[i2] = u * conjugate(w);
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
// last iteration
|
||||
for (Index i = 0; i < BUTTERFLY_DEPTH; ++i) {
|
||||
w = (u[i] - v[i]);
|
||||
u[i] = u[i] + v[i];
|
||||
v[i] = w * (double2){0.707106781186547461715008466854,
|
||||
-0.707106781186547461715008466854};
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// level 6
|
||||
// store registers in SM
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 64);
|
||||
i1 =
|
||||
2 * (params::degree / 64) * twid_id + (tid & (params::degree / 64 - 1));
|
||||
i2 = i1 + params::degree / 64;
|
||||
|
||||
w = negtwiddles[twid_id + 32];
|
||||
u = A[i1] - A[i2];
|
||||
|
||||
A[i1] += A[i2];
|
||||
A[i2] = u * conjugate(w);
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// level 5
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 32);
|
||||
i1 =
|
||||
2 * (params::degree / 32) * twid_id + (tid & (params::degree / 32 - 1));
|
||||
i2 = i1 + params::degree / 32;
|
||||
|
||||
w = negtwiddles[twid_id + 16];
|
||||
u = A[i1] - A[i2];
|
||||
|
||||
A[i1] += A[i2];
|
||||
A[i2] = u * conjugate(w);
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// level 4
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 16);
|
||||
i1 =
|
||||
2 * (params::degree / 16) * twid_id + (tid & (params::degree / 16 - 1));
|
||||
i2 = i1 + params::degree / 16;
|
||||
|
||||
w = negtwiddles[twid_id + 8];
|
||||
u = A[i1] - A[i2];
|
||||
|
||||
A[i1] += A[i2];
|
||||
A[i2] = u * conjugate(w);
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// level 3
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 8);
|
||||
i1 = 2 * (params::degree / 8) * twid_id + (tid & (params::degree / 8 - 1));
|
||||
i2 = i1 + params::degree / 8;
|
||||
|
||||
w = negtwiddles[twid_id + 4];
|
||||
u = A[i1] - A[i2];
|
||||
|
||||
A[i1] += A[i2];
|
||||
A[i2] = u * conjugate(w);
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// level 2
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 4);
|
||||
i1 = 2 * (params::degree / 4) * twid_id + (tid & (params::degree / 4 - 1));
|
||||
i2 = i1 + params::degree / 4;
|
||||
|
||||
w = negtwiddles[twid_id + 2];
|
||||
u = A[i1] - A[i2];
|
||||
|
||||
A[i1] += A[i2];
|
||||
A[i2] = u * conjugate(w);
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// level 1
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 2);
|
||||
i1 = 2 * (params::degree / 2) * twid_id + (tid & (params::degree / 2 - 1));
|
||||
i2 = i1 + params::degree / 2;
|
||||
|
||||
w = negtwiddles[twid_id + 1];
|
||||
u = A[i1] - A[i2];
|
||||
|
||||
A[i1] += A[i2];
|
||||
A[i2] = u * conjugate(w);
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
for (Index i = 0; i < BUTTERFLY_DEPTH; i++) {
|
||||
A[tid] = u[i];
|
||||
A[tid + HALF_DEGREE] = v[i];
|
||||
tid = tid + STRIDE;
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
@@ -14,7 +14,6 @@
|
||||
#include "utils/kernel_dimensions.cuh"
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <omp.h>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
@@ -38,12 +37,12 @@ void host_resolve_signed_overflow(
|
||||
streams[0], gpu_indexes[0], x, last_block_output_carry, d_clears,
|
||||
mem->params.big_lwe_dimension, 1);
|
||||
|
||||
host_addition(streams[0], gpu_indexes[0], last_block_inner_propagation,
|
||||
last_block_inner_propagation, x, mem->params.big_lwe_dimension,
|
||||
1);
|
||||
host_addition(streams[0], gpu_indexes[0], last_block_inner_propagation,
|
||||
last_block_inner_propagation, last_block_input_carry,
|
||||
mem->params.big_lwe_dimension, 1);
|
||||
host_addition<Torus>(streams[0], gpu_indexes[0], last_block_inner_propagation,
|
||||
last_block_inner_propagation, x,
|
||||
mem->params.big_lwe_dimension, 1);
|
||||
host_addition<Torus>(streams[0], gpu_indexes[0], last_block_inner_propagation,
|
||||
last_block_inner_propagation, last_block_input_carry,
|
||||
mem->params.big_lwe_dimension, 1);
|
||||
|
||||
host_apply_univariate_lut_kb<Torus>(streams, gpu_indexes, gpu_count, result,
|
||||
last_block_inner_propagation,
|
||||
@@ -85,6 +84,7 @@ __host__ void host_integer_signed_overflowing_add_or_sub_kb(
|
||||
assert(radix_params.message_modulus >= 4 && radix_params.carry_modulus >= 4);
|
||||
|
||||
auto result = mem_ptr->result;
|
||||
auto neg_rhs = mem_ptr->neg_rhs;
|
||||
auto input_carries = mem_ptr->input_carries;
|
||||
auto output_carry = mem_ptr->output_carry;
|
||||
auto last_block_inner_propagation = mem_ptr->last_block_inner_propagation;
|
||||
@@ -94,11 +94,14 @@ __host__ void host_integer_signed_overflowing_add_or_sub_kb(
|
||||
|
||||
// phase 1
|
||||
if (op == SIGNED_OPERATION::ADDITION) {
|
||||
host_addition(streams[0], gpu_indexes[0], result, lhs, rhs,
|
||||
big_lwe_dimension, num_blocks);
|
||||
host_addition<Torus>(streams[0], gpu_indexes[0], result, lhs, rhs,
|
||||
big_lwe_dimension, num_blocks);
|
||||
} else {
|
||||
host_subtraction(streams[0], gpu_indexes[0], result, lhs, rhs,
|
||||
big_lwe_dimension, num_blocks);
|
||||
host_integer_radix_negation<Torus>(
|
||||
streams, gpu_indexes, gpu_count, neg_rhs, rhs, big_lwe_dimension,
|
||||
num_blocks, radix_params.message_modulus, radix_params.carry_modulus);
|
||||
host_addition<Torus>(streams[0], gpu_indexes[0], result, lhs, neg_rhs,
|
||||
big_lwe_dimension, num_blocks);
|
||||
}
|
||||
|
||||
// phase 2
|
||||
@@ -106,28 +109,16 @@ __host__ void host_integer_signed_overflowing_add_or_sub_kb(
|
||||
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
|
||||
}
|
||||
|
||||
#pragma omp parallel sections
|
||||
{
|
||||
// generate input_carries and output_carry
|
||||
#pragma omp section
|
||||
{
|
||||
host_propagate_single_carry(
|
||||
mem_ptr->sub_streams_1, gpu_indexes, gpu_count, result, output_carry,
|
||||
input_carries, mem_ptr->scp_mem, bsks, ksks, num_blocks);
|
||||
}
|
||||
host_propagate_single_carry<Torus>(
|
||||
mem_ptr->sub_streams_1, gpu_indexes, gpu_count, result, output_carry,
|
||||
input_carries, mem_ptr->scp_mem, bsks, ksks, num_blocks);
|
||||
host_generate_last_block_inner_propagation<Torus>(
|
||||
mem_ptr->sub_streams_2, gpu_indexes, gpu_count,
|
||||
last_block_inner_propagation, &lhs[(num_blocks - 1) * big_lwe_size],
|
||||
&rhs[(num_blocks - 1) * big_lwe_size], mem_ptr->las_block_prop_mem, bsks,
|
||||
ksks);
|
||||
|
||||
// generate last_block_inner_propagation
|
||||
#pragma omp section
|
||||
{
|
||||
host_generate_last_block_inner_propagation(
|
||||
mem_ptr->sub_streams_2, gpu_indexes, gpu_count,
|
||||
last_block_inner_propagation, &lhs[(num_blocks - 1) * big_lwe_size],
|
||||
&rhs[(num_blocks - 1) * big_lwe_size], mem_ptr->las_block_prop_mem,
|
||||
bsks, ksks);
|
||||
}
|
||||
}
|
||||
|
||||
for (uint j = 0; j < gpu_count; j++) {
|
||||
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
|
||||
cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
|
||||
cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
|
||||
}
|
||||
@@ -135,7 +126,7 @@ __host__ void host_integer_signed_overflowing_add_or_sub_kb(
|
||||
// phase 3
|
||||
auto input_carry = &input_carries[(num_blocks - 1) * big_lwe_size];
|
||||
|
||||
host_resolve_signed_overflow(
|
||||
host_resolve_signed_overflow<Torus>(
|
||||
streams, gpu_indexes, gpu_count, overflowed, last_block_inner_propagation,
|
||||
input_carry, output_carry, mem_ptr->resolve_overflow_mem, bsks, ksks);
|
||||
|
||||
|
||||
@@ -17,7 +17,7 @@ void scratch_cuda_integer_radix_cmux_kb_64(
|
||||
std::function<uint64_t(uint64_t)> predicate_lut_f =
|
||||
[](uint64_t x) -> uint64_t { return x == 1; };
|
||||
|
||||
scratch_cuda_integer_radix_cmux_kb(
|
||||
scratch_cuda_integer_radix_cmux_kb<uint64_t>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
(int_cmux_buffer<uint64_t> **)mem_ptr, predicate_lut_f,
|
||||
lwe_ciphertext_count, params, allocate_gpu_memory);
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
#define CUDA_INTEGER_CMUX_CUH
|
||||
|
||||
#include "integer.cuh"
|
||||
#include <omp.h>
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void zero_out_if(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
@@ -28,10 +27,11 @@ __host__ void zero_out_if(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
auto lwe_array_out_block = tmp_lwe_array_input + i * big_lwe_size;
|
||||
auto lwe_array_input_block = lwe_array_input + i * big_lwe_size;
|
||||
|
||||
device_pack_bivariate_blocks<<<num_blocks, num_threads, 0, streams[0]>>>(
|
||||
lwe_array_out_block, predicate->lwe_indexes_in, lwe_array_input_block,
|
||||
lwe_condition, predicate->lwe_indexes_in, params.big_lwe_dimension,
|
||||
params.message_modulus, 1);
|
||||
device_pack_bivariate_blocks<Torus>
|
||||
<<<num_blocks, num_threads, 0, streams[0]>>>(
|
||||
lwe_array_out_block, predicate->lwe_indexes_in,
|
||||
lwe_array_input_block, lwe_condition, predicate->lwe_indexes_in,
|
||||
params.big_lwe_dimension, params.message_modulus, 1);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
@@ -57,27 +57,20 @@ __host__ void host_integer_radix_cmux_kb(
|
||||
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
|
||||
}
|
||||
|
||||
#pragma omp parallel sections
|
||||
{
|
||||
// Both sections may be executed in parallel
|
||||
#pragma omp section
|
||||
{
|
||||
auto mem_true = mem_ptr->zero_if_true_buffer;
|
||||
zero_out_if(true_streams, gpu_indexes, gpu_count, mem_ptr->tmp_true_ct,
|
||||
lwe_array_true, lwe_condition, mem_true,
|
||||
mem_ptr->inverted_predicate_lut, bsks, ksks,
|
||||
num_radix_blocks);
|
||||
}
|
||||
#pragma omp section
|
||||
{
|
||||
auto mem_false = mem_ptr->zero_if_false_buffer;
|
||||
zero_out_if(false_streams, gpu_indexes, gpu_count, mem_ptr->tmp_false_ct,
|
||||
lwe_array_false, lwe_condition, mem_false,
|
||||
mem_ptr->predicate_lut, bsks, ksks, num_radix_blocks);
|
||||
}
|
||||
}
|
||||
for (uint j = 0; j < gpu_count; j++) {
|
||||
auto mem_true = mem_ptr->zero_if_true_buffer;
|
||||
zero_out_if<Torus>(true_streams, gpu_indexes, gpu_count, mem_ptr->tmp_true_ct,
|
||||
lwe_array_true, lwe_condition, mem_true,
|
||||
mem_ptr->inverted_predicate_lut, bsks, ksks,
|
||||
num_radix_blocks);
|
||||
auto mem_false = mem_ptr->zero_if_false_buffer;
|
||||
zero_out_if<Torus>(false_streams, gpu_indexes, gpu_count,
|
||||
mem_ptr->tmp_false_ct, lwe_array_false, lwe_condition,
|
||||
mem_false, mem_ptr->predicate_lut, bsks, ksks,
|
||||
num_radix_blocks);
|
||||
for (uint j = 0; j < mem_ptr->zero_if_true_buffer->active_gpu_count; j++) {
|
||||
cuda_synchronize_stream(true_streams[j], gpu_indexes[j]);
|
||||
}
|
||||
for (uint j = 0; j < mem_ptr->zero_if_false_buffer->active_gpu_count; j++) {
|
||||
cuda_synchronize_stream(false_streams[j], gpu_indexes[j]);
|
||||
}
|
||||
|
||||
@@ -85,9 +78,9 @@ __host__ void host_integer_radix_cmux_kb(
|
||||
// will be 0. If the condition was false, true_ct will be 0 and false_ct will
|
||||
// have kept its value
|
||||
auto added_cts = mem_ptr->tmp_true_ct;
|
||||
host_addition(streams[0], gpu_indexes[0], added_cts, mem_ptr->tmp_true_ct,
|
||||
mem_ptr->tmp_false_ct, params.big_lwe_dimension,
|
||||
num_radix_blocks);
|
||||
host_addition<Torus>(streams[0], gpu_indexes[0], added_cts,
|
||||
mem_ptr->tmp_true_ct, mem_ptr->tmp_false_ct,
|
||||
params.big_lwe_dimension, num_radix_blocks);
|
||||
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, lwe_array_out, added_cts, bsks, ksks,
|
||||
|
||||
@@ -43,7 +43,7 @@ __host__ void accumulate_all_blocks(cudaStream_t stream, uint32_t gpu_index,
|
||||
int num_entries = (lwe_dimension + 1);
|
||||
getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
|
||||
// Add all blocks and store in sum
|
||||
device_accumulate_all_blocks<<<num_blocks, num_threads, 0, stream>>>(
|
||||
device_accumulate_all_blocks<Torus><<<num_blocks, num_threads, 0, stream>>>(
|
||||
output, input, lwe_dimension, num_radix_blocks);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
@@ -62,7 +62,6 @@ __host__ void are_all_comparisons_block_true(
|
||||
int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
|
||||
uint32_t num_radix_blocks) {
|
||||
|
||||
cudaSetDevice(gpu_indexes[0]);
|
||||
auto params = mem_ptr->params;
|
||||
auto big_lwe_dimension = params.big_lwe_dimension;
|
||||
auto glwe_dimension = params.glwe_dimension;
|
||||
@@ -75,7 +74,7 @@ __host__ void are_all_comparisons_block_true(
|
||||
auto tmp_out = are_all_block_true_buffer->tmp_out;
|
||||
|
||||
uint32_t total_modulus = message_modulus * carry_modulus;
|
||||
uint32_t max_value = total_modulus - 1;
|
||||
uint32_t max_value = (total_modulus - 1) / (message_modulus - 1);
|
||||
|
||||
cuda_memcpy_async_gpu_to_gpu(tmp_out, lwe_array_in,
|
||||
num_radix_blocks * (big_lwe_dimension + 1) *
|
||||
@@ -96,8 +95,9 @@ __host__ void are_all_comparisons_block_true(
|
||||
auto is_equal_to_num_blocks_map =
|
||||
&are_all_block_true_buffer->is_equal_to_lut_map;
|
||||
for (int i = 0; i < num_chunks; i++) {
|
||||
accumulate_all_blocks(streams[0], gpu_indexes[0], accumulator,
|
||||
input_blocks, big_lwe_dimension, chunk_length);
|
||||
accumulate_all_blocks<Torus>(streams[0], gpu_indexes[0], accumulator,
|
||||
input_blocks, big_lwe_dimension,
|
||||
chunk_length);
|
||||
|
||||
accumulator += (big_lwe_dimension + 1);
|
||||
remaining_blocks -= (chunk_length - 1);
|
||||
@@ -121,9 +121,8 @@ __host__ void are_all_comparisons_block_true(
|
||||
new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params,
|
||||
max_value, num_radix_blocks, true);
|
||||
|
||||
auto is_equal_to_num_blocks_lut_f = [max_value,
|
||||
chunk_length](Torus x) -> Torus {
|
||||
return (x & max_value) == chunk_length;
|
||||
auto is_equal_to_num_blocks_lut_f = [chunk_length](Torus x) -> Torus {
|
||||
return x == chunk_length;
|
||||
};
|
||||
generate_device_accumulator<Torus>(
|
||||
streams[0], gpu_indexes[0], new_lut->get_lut(gpu_indexes[0], 0),
|
||||
@@ -165,7 +164,6 @@ __host__ void is_at_least_one_comparisons_block_true(
|
||||
int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
|
||||
uint32_t num_radix_blocks) {
|
||||
|
||||
cudaSetDevice(gpu_indexes[0]);
|
||||
auto params = mem_ptr->params;
|
||||
auto big_lwe_dimension = params.big_lwe_dimension;
|
||||
auto message_modulus = params.message_modulus;
|
||||
@@ -174,7 +172,7 @@ __host__ void is_at_least_one_comparisons_block_true(
|
||||
auto buffer = mem_ptr->eq_buffer->are_all_block_true_buffer;
|
||||
|
||||
uint32_t total_modulus = message_modulus * carry_modulus;
|
||||
uint32_t max_value = total_modulus - 1;
|
||||
uint32_t max_value = (total_modulus - 1) / (message_modulus - 1);
|
||||
|
||||
cuda_memcpy_async_gpu_to_gpu(mem_ptr->tmp_lwe_array_out, lwe_array_in,
|
||||
num_radix_blocks * (big_lwe_dimension + 1) *
|
||||
@@ -192,8 +190,9 @@ __host__ void is_at_least_one_comparisons_block_true(
|
||||
auto input_blocks = mem_ptr->tmp_lwe_array_out;
|
||||
auto accumulator = buffer->tmp_block_accumulated;
|
||||
for (int i = 0; i < num_chunks; i++) {
|
||||
accumulate_all_blocks(streams[0], gpu_indexes[0], accumulator,
|
||||
input_blocks, big_lwe_dimension, chunk_length);
|
||||
accumulate_all_blocks<Torus>(streams[0], gpu_indexes[0], accumulator,
|
||||
input_blocks, big_lwe_dimension,
|
||||
chunk_length);
|
||||
|
||||
accumulator += (big_lwe_dimension + 1);
|
||||
remaining_blocks -= (chunk_length - 1);
|
||||
@@ -245,7 +244,6 @@ __host__ void host_compare_with_zero_equality(
|
||||
int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
|
||||
int32_t num_radix_blocks, int_radix_lut<Torus> *zero_comparison) {
|
||||
|
||||
cudaSetDevice(gpu_indexes[0]);
|
||||
auto params = mem_ptr->params;
|
||||
auto big_lwe_dimension = params.big_lwe_dimension;
|
||||
auto message_modulus = params.message_modulus;
|
||||
@@ -281,8 +279,8 @@ __host__ void host_compare_with_zero_equality(
|
||||
uint32_t chunk_size =
|
||||
std::min(remainder_blocks, num_elements_to_fill_carry);
|
||||
|
||||
accumulate_all_blocks(streams[0], gpu_indexes[0], sum_i, chunk,
|
||||
big_lwe_dimension, chunk_size);
|
||||
accumulate_all_blocks<Torus>(streams[0], gpu_indexes[0], sum_i, chunk,
|
||||
big_lwe_dimension, chunk_size);
|
||||
|
||||
num_sum_blocks++;
|
||||
remainder_blocks -= (chunk_size - 1);
|
||||
@@ -296,8 +294,9 @@ __host__ void host_compare_with_zero_equality(
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, sum, sum, bsks, ksks, num_sum_blocks,
|
||||
zero_comparison);
|
||||
are_all_comparisons_block_true(streams, gpu_indexes, gpu_count, lwe_array_out,
|
||||
sum, mem_ptr, bsks, ksks, num_sum_blocks);
|
||||
are_all_comparisons_block_true<Torus>(streams, gpu_indexes, gpu_count,
|
||||
lwe_array_out, sum, mem_ptr, bsks, ksks,
|
||||
num_sum_blocks);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
@@ -311,7 +310,7 @@ __host__ void host_integer_radix_equality_check_kb(
|
||||
|
||||
// Applies the LUT for the comparison operation
|
||||
auto comparisons = mem_ptr->tmp_block_comparisons;
|
||||
integer_radix_apply_bivariate_lookup_table_kb(
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, comparisons, lwe_array_1, lwe_array_2,
|
||||
bsks, ksks, num_radix_blocks, eq_buffer->operator_lut,
|
||||
eq_buffer->operator_lut->params.message_modulus);
|
||||
@@ -320,9 +319,9 @@ __host__ void host_integer_radix_equality_check_kb(
|
||||
//
|
||||
// It returns a block encrypting 1 if all input blocks are 1
|
||||
// otherwise the block encrypts 0
|
||||
are_all_comparisons_block_true(streams, gpu_indexes, gpu_count, lwe_array_out,
|
||||
comparisons, mem_ptr, bsks, ksks,
|
||||
num_radix_blocks);
|
||||
are_all_comparisons_block_true<Torus>(streams, gpu_indexes, gpu_count,
|
||||
lwe_array_out, comparisons, mem_ptr,
|
||||
bsks, ksks, num_radix_blocks);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
@@ -353,19 +352,20 @@ compare_radix_blocks_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
|
||||
// Subtract
|
||||
// Here we need the true lwe sub, not the one that comes from shortint.
|
||||
host_subtraction(streams[0], gpu_indexes[0], lwe_array_out, lwe_array_left,
|
||||
lwe_array_right, big_lwe_dimension, num_radix_blocks);
|
||||
host_subtraction<Torus>(streams[0], gpu_indexes[0], lwe_array_out,
|
||||
lwe_array_left, lwe_array_right, big_lwe_dimension,
|
||||
num_radix_blocks);
|
||||
|
||||
// Apply LUT to compare to 0
|
||||
auto is_non_zero_lut = mem_ptr->eq_buffer->is_non_zero_lut;
|
||||
integer_radix_apply_univariate_lookup_table_kb(
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_out, bsks, ksks,
|
||||
num_radix_blocks, is_non_zero_lut);
|
||||
|
||||
// Add one
|
||||
// Here Lhs can have the following values: (-1) % (message modulus * carry
|
||||
// modulus), 0, 1. So the output values after the addition will be: 0, 1, 2
|
||||
host_integer_radix_add_scalar_one_inplace(
|
||||
host_integer_radix_add_scalar_one_inplace<Torus>(
|
||||
streams, gpu_indexes, gpu_count, lwe_array_out, big_lwe_dimension,
|
||||
num_radix_blocks, message_modulus, carry_modulus);
|
||||
}
|
||||
@@ -407,8 +407,8 @@ tree_sign_reduction(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
|
||||
auto inner_tree_leaf = tree_buffer->tree_inner_leaf_lut;
|
||||
while (partial_block_count > 2) {
|
||||
pack_blocks(streams[0], gpu_indexes[0], y, x, big_lwe_dimension,
|
||||
partial_block_count, 4);
|
||||
pack_blocks<Torus>(streams[0], gpu_indexes[0], y, x, big_lwe_dimension,
|
||||
partial_block_count, 4);
|
||||
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, x, y, bsks, ksks,
|
||||
@@ -434,8 +434,8 @@ tree_sign_reduction(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
std::function<Torus(Torus)> f;
|
||||
|
||||
if (partial_block_count == 2) {
|
||||
pack_blocks(streams[0], gpu_indexes[0], y, x, big_lwe_dimension,
|
||||
partial_block_count, 4);
|
||||
pack_blocks<Torus>(streams[0], gpu_indexes[0], y, x, big_lwe_dimension,
|
||||
partial_block_count, 4);
|
||||
|
||||
f = [block_selector_f, sign_handler_f](Torus x) -> Torus {
|
||||
int msb = (x >> 2) & 3;
|
||||
@@ -455,9 +455,9 @@ tree_sign_reduction(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
last_lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
|
||||
|
||||
// Last leaf
|
||||
integer_radix_apply_univariate_lookup_table_kb(streams, gpu_indexes,
|
||||
gpu_count, lwe_array_out, y,
|
||||
bsks, ksks, 1, last_lut);
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, lwe_array_out, y, bsks, ksks, 1,
|
||||
last_lut);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
@@ -489,19 +489,21 @@ __host__ void host_integer_radix_difference_check_kb(
|
||||
if (mem_ptr->is_signed) {
|
||||
packed_num_radix_blocks -= 2;
|
||||
}
|
||||
pack_blocks(streams[0], gpu_indexes[0], packed_left, lwe_array_left,
|
||||
big_lwe_dimension, packed_num_radix_blocks, message_modulus);
|
||||
pack_blocks(streams[0], gpu_indexes[0], packed_right, lwe_array_right,
|
||||
big_lwe_dimension, packed_num_radix_blocks, message_modulus);
|
||||
pack_blocks<Torus>(streams[0], gpu_indexes[0], packed_left, lwe_array_left,
|
||||
big_lwe_dimension, packed_num_radix_blocks,
|
||||
message_modulus);
|
||||
pack_blocks<Torus>(streams[0], gpu_indexes[0], packed_right,
|
||||
lwe_array_right, big_lwe_dimension,
|
||||
packed_num_radix_blocks, message_modulus);
|
||||
// From this point we have half number of blocks
|
||||
packed_num_radix_blocks /= 2;
|
||||
|
||||
// Clean noise
|
||||
auto identity_lut = mem_ptr->identity_lut;
|
||||
integer_radix_apply_univariate_lookup_table_kb(
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, packed_left, packed_left, bsks, ksks,
|
||||
packed_num_radix_blocks, identity_lut);
|
||||
integer_radix_apply_univariate_lookup_table_kb(
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, packed_right, packed_right, bsks, ksks,
|
||||
packed_num_radix_blocks, identity_lut);
|
||||
|
||||
@@ -518,16 +520,17 @@ __host__ void host_integer_radix_difference_check_kb(
|
||||
if (!mem_ptr->is_signed) {
|
||||
// Compare packed blocks, or simply the total number of radix blocks in the
|
||||
// inputs
|
||||
compare_radix_blocks_kb(streams, gpu_indexes, gpu_count, comparisons, lhs,
|
||||
rhs, mem_ptr, bsks, ksks, packed_num_radix_blocks);
|
||||
compare_radix_blocks_kb<Torus>(streams, gpu_indexes, gpu_count, comparisons,
|
||||
lhs, rhs, mem_ptr, bsks, ksks,
|
||||
packed_num_radix_blocks);
|
||||
num_comparisons = packed_num_radix_blocks;
|
||||
} else {
|
||||
// Packing is possible
|
||||
if (carry_modulus >= message_modulus) {
|
||||
// Compare (num_radix_blocks - 2) / 2 packed blocks
|
||||
compare_radix_blocks_kb(streams, gpu_indexes, gpu_count, comparisons, lhs,
|
||||
rhs, mem_ptr, bsks, ksks,
|
||||
packed_num_radix_blocks);
|
||||
compare_radix_blocks_kb<Torus>(streams, gpu_indexes, gpu_count,
|
||||
comparisons, lhs, rhs, mem_ptr, bsks, ksks,
|
||||
packed_num_radix_blocks);
|
||||
|
||||
// Compare the last block before the sign block separately
|
||||
auto identity_lut = mem_ptr->identity_lut;
|
||||
@@ -536,21 +539,21 @@ __host__ void host_integer_radix_difference_check_kb(
|
||||
Torus *last_right_block_before_sign_block =
|
||||
diff_buffer->tmp_packed_right +
|
||||
packed_num_radix_blocks * big_lwe_size;
|
||||
integer_radix_apply_univariate_lookup_table_kb(
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, last_left_block_before_sign_block,
|
||||
lwe_array_left + (num_radix_blocks - 2) * big_lwe_size, bsks, ksks, 1,
|
||||
identity_lut);
|
||||
integer_radix_apply_univariate_lookup_table_kb(
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, last_right_block_before_sign_block,
|
||||
lwe_array_right + (num_radix_blocks - 2) * big_lwe_size, bsks, ksks,
|
||||
1, identity_lut);
|
||||
compare_radix_blocks_kb(
|
||||
compare_radix_blocks_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count,
|
||||
comparisons + packed_num_radix_blocks * big_lwe_size,
|
||||
last_left_block_before_sign_block, last_right_block_before_sign_block,
|
||||
mem_ptr, bsks, ksks, 1);
|
||||
// Compare the sign block separately
|
||||
integer_radix_apply_bivariate_lookup_table_kb(
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count,
|
||||
comparisons + (packed_num_radix_blocks + 1) * big_lwe_size,
|
||||
lwe_array_left + (num_radix_blocks - 1) * big_lwe_size,
|
||||
@@ -559,11 +562,11 @@ __host__ void host_integer_radix_difference_check_kb(
|
||||
num_comparisons = packed_num_radix_blocks + 2;
|
||||
|
||||
} else {
|
||||
compare_radix_blocks_kb(streams, gpu_indexes, gpu_count, comparisons,
|
||||
lwe_array_left, lwe_array_right, mem_ptr, bsks,
|
||||
ksks, num_radix_blocks - 1);
|
||||
compare_radix_blocks_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, comparisons, lwe_array_left,
|
||||
lwe_array_right, mem_ptr, bsks, ksks, num_radix_blocks - 1);
|
||||
// Compare the sign block separately
|
||||
integer_radix_apply_bivariate_lookup_table_kb(
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count,
|
||||
comparisons + (num_radix_blocks - 1) * big_lwe_size,
|
||||
lwe_array_left + (num_radix_blocks - 1) * big_lwe_size,
|
||||
@@ -576,9 +579,9 @@ __host__ void host_integer_radix_difference_check_kb(
|
||||
// Reduces a vec containing radix blocks that encrypts a sign
|
||||
// (inferior, equal, superior) to one single radix block containing the
|
||||
// final sign
|
||||
tree_sign_reduction(streams, gpu_indexes, gpu_count, lwe_array_out,
|
||||
comparisons, mem_ptr->diff_buffer->tree_buffer,
|
||||
reduction_lut_f, bsks, ksks, num_comparisons);
|
||||
tree_sign_reduction<Torus>(streams, gpu_indexes, gpu_count, lwe_array_out,
|
||||
comparisons, mem_ptr->diff_buffer->tree_buffer,
|
||||
reduction_lut_f, bsks, ksks, num_comparisons);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
@@ -602,16 +605,16 @@ host_integer_radix_maxmin_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
Torus **ksks, uint32_t total_num_radix_blocks) {
|
||||
|
||||
// Compute the sign
|
||||
host_integer_radix_difference_check_kb(
|
||||
host_integer_radix_difference_check_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, mem_ptr->tmp_lwe_array_out,
|
||||
lwe_array_left, lwe_array_right, mem_ptr, mem_ptr->identity_lut_f, bsks,
|
||||
ksks, total_num_radix_blocks);
|
||||
|
||||
// Selector
|
||||
host_integer_radix_cmux_kb(streams, gpu_indexes, gpu_count, lwe_array_out,
|
||||
mem_ptr->tmp_lwe_array_out, lwe_array_left,
|
||||
lwe_array_right, mem_ptr->cmux_buffer, bsks, ksks,
|
||||
total_num_radix_blocks);
|
||||
host_integer_radix_cmux_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, lwe_array_out,
|
||||
mem_ptr->tmp_lwe_array_out, lwe_array_left, lwe_array_right,
|
||||
mem_ptr->cmux_buffer, bsks, ksks, total_num_radix_blocks);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
@@ -0,0 +1,89 @@
|
||||
#include "compression.cuh"
|
||||
|
||||
/// C entry point that prepares the state needed to compress radix
/// ciphertexts whose blocks are 64-bit LWEs.
///
/// The compression parameters are gathered in an `int_radix_params` and
/// forwarded, together with `lwe_per_glwe` and `storage_log_modulus`, to the
/// templated scratch function, which stores the created
/// `int_compression<uint64_t>` in `*mem_ptr`.
void scratch_cuda_integer_compress_radix_ciphertext_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
    uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
    uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
    PBS_TYPE pbs_type, uint32_t lwe_per_glwe, uint32_t storage_log_modulus,
    bool allocate_gpu_memory) {

  // Number of coefficients of one compression GLWE: (k + 1) * N
  const uint32_t glwe_accumulator_size =
      (compression_glwe_dimension + 1) * compression_polynomial_size;

  // The three zeros are the PBS-related arguments, unused for compression
  // (presumably pbs_level, pbs_base_log and grouping_factor - confirm against
  // the int_radix_params constructor)
  int_radix_params compression_params(
      pbs_type, compression_glwe_dimension, compression_polynomial_size,
      glwe_accumulator_size, lwe_dimension, ks_level, ks_base_log, 0, 0, 0,
      message_modulus, carry_modulus);

  auto typed_streams = (cudaStream_t *)(streams);
  auto typed_mem_ptr = (int_compression<uint64_t> **)mem_ptr;

  scratch_cuda_compress_integer_radix_ciphertext<uint64_t>(
      typed_streams, gpu_indexes, gpu_count, typed_mem_ptr, num_radix_blocks,
      compression_params, lwe_per_glwe, storage_log_modulus,
      allocate_gpu_memory);
}
|
||||
/// C entry point that prepares the state needed to decompress radix
/// ciphertexts whose blocks are 64-bit LWEs.
///
/// Two parameter sets are built: one for the encryption (output) side and one
/// for the compression (input) side. Both are forwarded to the templated
/// scratch function, which stores the created `int_decompression<uint64_t>`
/// in `*mem_ptr`.
void scratch_cuda_integer_decompress_radix_ciphertext_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
    uint32_t encryption_glwe_dimension, uint32_t encryption_polynomial_size,
    uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
    uint32_t lwe_dimension, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
    PBS_TYPE pbs_type, uint32_t storage_log_modulus, uint32_t body_count,
    bool allocate_gpu_memory) {

  // Decompression doesn't keyswitch, so big and small dimensions are the same
  // and the keyswitch level / base log are left at zero
  int_radix_params encryption_params(
      pbs_type, encryption_glwe_dimension, encryption_polynomial_size,
      lwe_dimension, lwe_dimension, 0, 0, pbs_level, pbs_base_log, 0,
      message_modulus, carry_modulus);

  // Dimension of the LWEs obtained from a compression GLWE: k * N
  const uint32_t compression_lwe_dimension =
      compression_glwe_dimension * compression_polynomial_size;

  int_radix_params compression_params(
      pbs_type, compression_glwe_dimension, compression_polynomial_size,
      lwe_dimension, compression_lwe_dimension, 0, 0, pbs_level, pbs_base_log,
      0, message_modulus, carry_modulus);

  auto typed_streams = (cudaStream_t *)(streams);
  auto typed_mem_ptr = (int_decompression<uint64_t> **)mem_ptr;

  scratch_cuda_integer_decompress_radix_ciphertext<uint64_t>(
      typed_streams, gpu_indexes, gpu_count, typed_mem_ptr, num_radix_blocks,
      body_count, encryption_params, compression_params, storage_log_modulus,
      allocate_gpu_memory);
}
|
||||
/// C entry point for the compression of 64-bit radix ciphertexts.
///
/// Casts the type-erased arguments to their 64-bit counterparts and calls
/// `host_integer_compress`. `num_nths` is forwarded as the number of radix
/// blocks to compress and `mem_ptr` must point to an
/// `int_compression<uint64_t>`.
void cuda_integer_compress_radix_ciphertext_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    void *glwe_array_out, void *lwe_array_in, void **fp_ksk, uint32_t num_nths,
    int8_t *mem_ptr) {

  auto typed_streams = (cudaStream_t *)(streams);
  auto packed_glwes = static_cast<uint64_t *>(glwe_array_out);
  auto input_lwes = static_cast<uint64_t *>(lwe_array_in);
  auto packing_ksks = (uint64_t **)(fp_ksk);
  auto compression_mem = (int_compression<uint64_t> *)mem_ptr;

  host_integer_compress<uint64_t>(typed_streams, gpu_indexes, gpu_count,
                                  packed_glwes, input_lwes, packing_ksks,
                                  num_nths, compression_mem);
}
|
||||
/// C entry point for the decompression of 64-bit radix ciphertexts.
///
/// Casts the type-erased arguments to their 64-bit counterparts and calls
/// `host_integer_decompress`. `indexes_array` is forwarded as the host-side
/// list of LWE indexes to decompress and `mem_ptr` must point to an
/// `int_decompression<uint64_t>`.
void cuda_integer_decompress_radix_ciphertext_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    void *lwe_array_out, void *glwe_in, uint32_t *indexes_array,
    uint32_t indexes_array_size, void **bsks, int8_t *mem_ptr) {

  auto typed_streams = (cudaStream_t *)(streams);
  auto output_lwes = static_cast<uint64_t *>(lwe_array_out);
  auto packed_glwes = static_cast<uint64_t *>(glwe_in);
  auto decompression_mem = (int_decompression<uint64_t> *)mem_ptr;

  host_integer_decompress<uint64_t>(typed_streams, gpu_indexes, gpu_count,
                                    output_lwes, packed_glwes, indexes_array,
                                    indexes_array_size, bsks,
                                    decompression_mem);
}
|
||||
|
||||
/// Frees the compression state created by
/// scratch_cuda_integer_compress_radix_ciphertext_64.
///
/// `mem_ptr_void` is the handle that was filled by the scratch function. The
/// buffers owned by the object are released on the given streams, then the
/// object itself is destroyed and the handle is reset to nullptr so that a
/// second cleanup call is a no-op.
void cleanup_cuda_integer_compress_radix_ciphertext_64(void **streams,
                                                       uint32_t *gpu_indexes,
                                                       uint32_t gpu_count,
                                                       int8_t **mem_ptr_void) {

  // Nothing to do for a missing handle or one that was already cleaned up
  if (mem_ptr_void == nullptr || *mem_ptr_void == nullptr)
    return;

  int_compression<uint64_t> *mem_ptr =
      (int_compression<uint64_t> *)(*mem_ptr_void);
  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);

  // The object was allocated with `new` by the scratch function; release()
  // alone would leak it
  delete mem_ptr;
  *mem_ptr_void = nullptr;
}
|
||||
|
||||
/// Frees the decompression state created by
/// scratch_cuda_integer_decompress_radix_ciphertext_64.
///
/// `mem_ptr_void` is the handle that was filled by the scratch function. The
/// buffers owned by the object are released on the given streams, then the
/// object itself is destroyed and the handle is reset to nullptr so that a
/// second cleanup call is a no-op.
void cleanup_cuda_integer_decompress_radix_ciphertext_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr_void) {

  // Nothing to do for a missing handle or one that was already cleaned up
  if (mem_ptr_void == nullptr || *mem_ptr_void == nullptr)
    return;

  int_decompression<uint64_t> *mem_ptr =
      (int_decompression<uint64_t> *)(*mem_ptr_void);
  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);

  // The object was allocated with `new` by the scratch function; release()
  // alone would leak it
  delete mem_ptr;
  *mem_ptr_void = nullptr;
}
|
||||
@@ -0,0 +1,390 @@
|
||||
#ifndef CUDA_INTEGER_COMPRESSION_CUH
|
||||
#define CUDA_INTEGER_COMPRESSION_CUH
|
||||
|
||||
#include "ciphertext.h"
|
||||
#include "compression.h"
|
||||
#include "crypto/keyswitch.cuh"
|
||||
#include "device.h"
|
||||
#include "integer/integer.cuh"
|
||||
#include "linearalgebra/multiplication.cuh"
|
||||
#include "polynomial/functions.cuh"
|
||||
#include "utils/kernel_dimensions.cuh"
|
||||
|
||||
/// Bit-packing kernel: one thread produces one output word.
///
/// The input is seen as consecutive chunks of `in_len` coefficients and the
/// output as consecutive chunks of `out_len` words; thread `t` writes word
/// `t % out_len` of chunk `t / out_len`. An output word is the OR of every
/// input coefficient of its chunk whose `log_modulus`-bit slot overlaps the
/// word. Threads with a global index >= `num_coeffs` do nothing.
///
/// Assumes each input coefficient only carries data in its `log_modulus`
/// least significant bits - TODO confirm against the caller.
template <typename Torus>
__global__ void pack(Torus *array_out, Torus *array_in, uint32_t log_modulus,
                     uint32_t num_coeffs, uint32_t in_len, uint32_t out_len) {
  auto word_bits = sizeof(Torus) * 8;
  auto global_id = threadIdx.x + blockIdx.x * blockDim.x;

  // Threads past the last output word have nothing to write
  if (global_id >= num_coeffs)
    return;

  auto chunk_id = global_id / out_len;
  auto word_id = global_id % out_len;
  auto src = array_in + chunk_id * in_len;
  auto dst = array_out + chunk_id * out_len;

  // First coefficient with bits in this word; its lowest bits may already
  // belong to the previous word and are shifted out
  auto first_coeff = word_bits * word_id / log_modulus;
  auto packed =
      src[first_coeff] >> (word_id * word_bits - first_coeff * log_modulus);

  // Following coefficients start inside this word: shift them into place
  for (auto coeff = first_coeff + 1;
       coeff * log_modulus < ((word_id + 1) * word_bits) && coeff < in_len;
       coeff++) {
    packed |= src[coeff] << (coeff * log_modulus - word_id * word_bits);
  }

  dst[word_id] = packed;
}
|
||||
|
||||
/// Bit-packs `num_glwes` GLWE ciphertexts stored in `array_in` into
/// `array_out`, keeping `storage_log_modulus` bits per coefficient.
///
/// - `array_in` holds the GLWEs back to back, (k + 1) * N coefficients each.
/// - `array_out` receives one packed chunk of `out_len` words per GLWE.
/// - `num_lwes` is the total number of LWEs stored in the GLWEs; only
///   `num_lwes % N` body coefficients of the last GLWE are packed.
/// - Input and output must be different buffers.
///
/// The first `num_glwes - 1` GLWEs are packed in full by one launch, the last
/// (possibly partial) GLWE by a second one. A single launch cannot do both:
/// the kernel uses one `in_len` as input stride and as coefficient bound, and
/// that value differs between full GLWEs and the last one.
template <typename Torus>
__host__ void host_pack(cudaStream_t stream, uint32_t gpu_index,
                        Torus *array_out, Torus *array_in, uint32_t num_glwes,
                        uint32_t num_lwes, int_compression<Torus> *mem_ptr) {
  if (array_in == array_out)
    PANIC("Cuda error: Input and output must be different");

  // Nothing to pack; also prevents the unsigned underflow of num_glwes - 1
  if (num_glwes == 0)
    return;

  check_cuda_error(cudaSetDevice(gpu_index));
  auto compression_params = mem_ptr->compression_params;

  auto log_modulus = mem_ptr->storage_log_modulus;
  auto nbits = sizeof(Torus) * 8;

  // [0..num_glwes-1) GLWEs: every coefficient is packed
  auto full_in_len = (compression_params.glwe_dimension + 1) *
                     compression_params.polynomial_size;
  auto full_bits_to_pack = full_in_len * log_modulus;
  // number_bits_to_pack.div_ceil(Scalar::BITS)
  auto out_len = (full_bits_to_pack + nbits - 1) / nbits;

  // Last GLWE: the whole mask but only the bodies actually in use
  auto last_body_count = num_lwes % compression_params.polynomial_size;
  auto last_in_len =
      compression_params.glwe_dimension * compression_params.polynomial_size +
      last_body_count;
  auto last_bits_to_pack = last_in_len * log_modulus;
  auto last_out_len = (last_bits_to_pack + nbits - 1) / nbits;

  cuda_memset_async(array_out, 0,
                    num_glwes * (compression_params.glwe_dimension + 1) *
                        compression_params.polynomial_size * sizeof(Torus),
                    stream, gpu_index);

  auto num_full_glwes = num_glwes - 1;
  auto num_full_coeffs = num_full_glwes * out_len;

  // Full GLWEs: input stride and coefficient bound are both (k + 1) * N
  if (num_full_coeffs > 0) {
    int num_blocks = 0, num_threads = 0;
    getNumBlocksAndThreads(num_full_coeffs, 1024, num_blocks, num_threads);

    dim3 grid(num_blocks);
    dim3 threads(num_threads);
    pack<Torus><<<grid, threads, 0, stream>>>(array_out, array_in, log_modulus,
                                              num_full_coeffs, full_in_len,
                                              out_len);
    check_cuda_error(cudaGetLastError());
  }

  // Last GLWE: start from its own chunk so the kernel sees it as chunk 0
  // (last_out_len <= out_len) and bound the read to the coefficients in use
  if (last_out_len > 0) {
    int num_blocks = 0, num_threads = 0;
    getNumBlocksAndThreads(last_out_len, 1024, num_blocks, num_threads);

    dim3 grid(num_blocks);
    dim3 threads(num_threads);
    pack<Torus><<<grid, threads, 0, stream>>>(
        array_out + num_full_glwes * out_len,
        array_in + num_full_glwes * full_in_len, log_modulus, last_out_len,
        last_in_len, out_len);
    check_cuda_error(cudaGetLastError());
  }
}
|
||||
|
||||
/// Compresses `num_radix_blocks` LWE ciphertexts into packed GLWEs.
///
/// Everything runs on `streams[0]` / `gpu_indexes[0]`:
///  1. the input LWEs are multiplied by the message modulus into
///     `mem_ptr->tmp_lwe`;
///  2. the shifted LWEs are packing-keyswitched, `mem_ptr->lwe_per_glwe` at a
///     time, into consecutive GLWEs of `mem_ptr->tmp_glwe_array_out`;
///  3. the GLWEs are modulus-switched in place to
///     `mem_ptr->storage_log_modulus` bits;
///  4. the result is bit-packed into `glwe_array_out`.
///
/// Only `fp_ksk[0]` is used.
template <typename Torus>
__host__ void host_integer_compress(cudaStream_t *streams,
                                    uint32_t *gpu_indexes, uint32_t gpu_count,
                                    Torus *glwe_array_out, Torus *lwe_array_in,
                                    Torus **fp_ksk, uint32_t num_radix_blocks,
                                    int_compression<Torus> *mem_ptr) {

  auto compression_params = mem_ptr->compression_params;
  auto input_lwe_dimension = compression_params.small_lwe_dimension;

  uint32_t lwe_in_size = input_lwe_dimension + 1;
  uint32_t glwe_out_size = (compression_params.glwe_dimension + 1) *
                           compression_params.polynomial_size;
  // NOTE(review): this yields one extra, all-zero GLWE when num_radix_blocks
  // is an exact multiple of lwe_per_glwe; host_pack receives the same count -
  // confirm this is intended before changing it.
  uint32_t num_glwes_for_compression =
      num_radix_blocks / mem_ptr->lwe_per_glwe + 1;
  // Total number of coefficients over all the GLWEs
  auto total_glwe_coeffs = num_glwes_for_compression *
                           (compression_params.glwe_dimension + 1) *
                           compression_params.polynomial_size;

  // Shift
  auto lwe_shifted = mem_ptr->tmp_lwe;
  host_cleartext_multiplication<Torus>(
      streams[0], gpu_indexes[0], lwe_shifted, lwe_array_in,
      (uint64_t)compression_params.message_modulus, input_lwe_dimension,
      num_radix_blocks);

  // Keyswitch LWEs to GLWE; the destination is zeroed first
  auto tmp_glwe_array_out = mem_ptr->tmp_glwe_array_out;
  cuda_memset_async(tmp_glwe_array_out, 0, total_glwe_coeffs * sizeof(Torus),
                    streams[0], gpu_indexes[0]);
  auto fp_ks_buffer = mem_ptr->fp_ks_buffer;

  auto src_lwes = lwe_shifted;
  auto dst_glwe = tmp_glwe_array_out;
  for (auto remaining = num_radix_blocks; remaining > 0;) {
    // At most lwe_per_glwe LWEs go into one GLWE
    auto batch = min(remaining, mem_ptr->lwe_per_glwe);

    host_packing_keyswitch_lwe_list_to_glwe<Torus>(
        streams[0], gpu_indexes[0], dst_glwe, src_lwes, fp_ksk[0],
        fp_ks_buffer, input_lwe_dimension, compression_params.glwe_dimension,
        compression_params.polynomial_size, compression_params.ks_base_log,
        compression_params.ks_level, batch);

    remaining -= batch;
    src_lwes += batch * lwe_in_size;
    dst_glwe += glwe_out_size;
  }

  // Modulus switch
  host_modulus_switch_inplace<Torus>(streams[0], gpu_indexes[0],
                                     tmp_glwe_array_out, total_glwe_coeffs,
                                     mem_ptr->storage_log_modulus);

  // Bit-pack the modulus-switched GLWEs into the output
  host_pack<Torus>(streams[0], gpu_indexes[0], glwe_array_out,
                   tmp_glwe_array_out, num_glwes_for_compression,
                   num_radix_blocks, mem_ptr);
}
|
||||
|
||||
/// Unpacking kernel: one thread rebuilds one coefficient.
///
/// Reads packed chunk number `index` of `array_in` (chunks are `input_len`
/// words long), pulls out the `log_modulus`-bit value number `i` for each
/// thread `i < initial_out_len` and stores it in `glwe_array_out[i]`,
/// shifted up so that it occupies the most significant bits of the word.
template <typename Torus>
__global__ void extract(Torus *glwe_array_out, Torus *array_in, uint32_t index,
                        uint32_t log_modulus, uint32_t input_len,
                        uint32_t initial_out_len) {
  auto word_bits = sizeof(Torus) * 8;
  auto coeff_id = threadIdx.x + blockIdx.x * blockDim.x;

  // Threads past the last coefficient have nothing to write
  if (coeff_id >= initial_out_len)
    return;

  auto packed_glwe = array_in + index * input_len;

  // Unpack: locate the words holding bits [first_bit, end_bit)
  Torus mask = ((Torus)1 << log_modulus) - 1;
  auto first_bit = coeff_id * log_modulus;
  auto end_bit = (coeff_id + 1) * log_modulus;

  auto first_word = first_bit / word_bits;
  auto bit_offset = first_bit % word_bits;
  auto last_word = (end_bit - 1) / word_bits;

  auto low_part = packed_glwe[first_word] >> bit_offset;

  Torus unpacked;
  if (first_word != last_word) {
    // The value straddles two words: bring in its high bits from the next one
    auto high_part = packed_glwe[first_word + 1] << (word_bits - bit_offset);
    unpacked = (low_part | high_part) & mask;
  } else {
    unpacked = low_part & mask;
  }

  // Extract: move the value back to the most significant bits
  glwe_array_out[coeff_id] = unpacked << (word_bits - log_modulus);
}
|
||||
|
||||
/// Extracts the glwe_index-nth GLWE ciphertext
///
/// Unpacks chunk `glwe_index` of the packed buffer `array_in` into
/// `glwe_array_out`. The whole mask plus
/// min(mem_ptr->body_count, polynomial_size) body coefficients are unpacked;
/// the remaining body coefficients of the output are set to zero. Input and
/// output must be different buffers.
template <typename Torus>
__host__ void host_extract(cudaStream_t stream, uint32_t gpu_index,
                           Torus *glwe_array_out, Torus *array_in,
                           uint32_t glwe_index,
                           int_decompression<Torus> *mem_ptr) {
  if (array_in == glwe_array_out)
    PANIC("Cuda error: Input and output must be different");

  // Fail right away if the device cannot be selected instead of launching on
  // whatever device happens to be current
  check_cuda_error(cudaSetDevice(gpu_index));

  auto compression_params = mem_ptr->compression_params;

  auto log_modulus = mem_ptr->storage_log_modulus;

  uint32_t body_count =
      std::min(mem_ptr->body_count, compression_params.polynomial_size);
  // Number of coefficients to unpack: full mask + bodies in use
  auto initial_out_len =
      compression_params.glwe_dimension * compression_params.polynomial_size +
      body_count;

  auto compressed_glwe_accumulator_size =
      (compression_params.glwe_dimension + 1) *
      compression_params.polynomial_size;
  auto number_bits_to_unpack = compressed_glwe_accumulator_size * log_modulus;
  auto nbits = sizeof(Torus) * 8;
  // number_bits_to_unpack.div_ceil(Scalar::BITS)
  // This is the length, in words, of one packed chunk of the input
  auto input_len = (number_bits_to_unpack + nbits - 1) / nbits;

  // We assure the tail of the glwe is zeroed
  auto zeroed_slice = glwe_array_out + initial_out_len;
  cuda_memset_async(zeroed_slice, 0,
                    (compression_params.polynomial_size - body_count) *
                        sizeof(Torus),
                    stream, gpu_index);
  int num_blocks = 0, num_threads = 0;
  getNumBlocksAndThreads(initial_out_len, 128, num_blocks, num_threads);
  dim3 grid(num_blocks);
  dim3 threads(num_threads);
  extract<Torus><<<grid, threads, 0, stream>>>(glwe_array_out, array_in,
                                               glwe_index, log_modulus,
                                               input_len, initial_out_len);
  check_cuda_error(cudaGetLastError());
}
|
||||
|
||||
/// Decompresses the LWEs listed in `h_indexes_array` from packed GLWEs.
///
/// - `d_lwe_array_out`: device output, one LWE per requested index.
/// - `d_packed_glwe_in`: device input holding the packed GLWEs.
/// - `h_indexes_array`: host array of `indexes_array_size` LWE indexes; the
///   GLWE holding an index is `index / polynomial_size`. Indexes belonging to
///   the same GLWE are expected to be contiguous in the array.
/// - `d_bsks`: bootstrapping keys used by the final PBS.
/// - `h_mem_ptr`: state created by the decompression scratch function.
///
/// Steps: the needed GLWEs are unpacked into `tmp_extracted_glwe`, the
/// requested LWEs are sample-extracted into `tmp_extracted_lwe`, and a PBS
/// with `carry_extract_lut` produces the output LWEs.
///
/// Panics if `indexes_array_size` exceeds the polynomial size or differs from
/// `h_mem_ptr->num_radix_blocks`. Returns without doing anything when there is
/// no index to decompress.
template <typename Torus>
__host__ void
host_integer_decompress(cudaStream_t *streams, uint32_t *gpu_indexes,
                        uint32_t gpu_count, Torus *d_lwe_array_out,
                        Torus *d_packed_glwe_in, uint32_t *h_indexes_array,
                        uint32_t indexes_array_size, void **d_bsks,
                        int_decompression<Torus> *h_mem_ptr) {

  // Validate the sizes before anything is enqueued on the device
  auto compression_params = h_mem_ptr->compression_params;
  auto lwe_per_glwe = compression_params.polynomial_size;
  if (indexes_array_size > lwe_per_glwe)
    PANIC("Cuda error: too many LWEs to decompress. The number of LWEs should "
          "be smaller than "
          "polynomial_size.")

  auto num_radix_blocks = h_mem_ptr->num_radix_blocks;
  if (num_radix_blocks != indexes_array_size)
    PANIC("Cuda error: wrong number of LWEs in decompress: the number of LWEs "
          "should be the same as indexes_array_size.")

  // Nothing to decompress; h_indexes_array[0] must not be read in that case
  if (indexes_array_size == 0)
    return;

  auto d_indexes_array = h_mem_ptr->tmp_indexes_array;
  cuda_memcpy_async_to_gpu(d_indexes_array, h_indexes_array,
                           indexes_array_size * sizeof(uint32_t), streams[0],
                           gpu_indexes[0]);

  // the first element is the last index in h_indexes_array that lies in the
  // related GLWE
  std::vector<std::pair<uint32_t, Torus *>> glwe_vec;

  // Extract all GLWEs
  size_t glwe_accumulator_size = (compression_params.glwe_dimension + 1) *
                                 compression_params.polynomial_size;

  auto current_glwe_index = h_indexes_array[0] / lwe_per_glwe;
  auto extracted_glwe = h_mem_ptr->tmp_extracted_glwe;
  host_extract<Torus>(streams[0], gpu_indexes[0], extracted_glwe,
                      d_packed_glwe_in, current_glwe_index, h_mem_ptr);
  glwe_vec.push_back(std::make_pair((uint32_t)0, extracted_glwe));
  for (uint32_t i = 1; i < indexes_array_size; i++) {
    auto glwe_index = h_indexes_array[i] / lwe_per_glwe;
    if (glwe_index != current_glwe_index) {
      extracted_glwe += glwe_accumulator_size;
      current_glwe_index = glwe_index;
      // Extracts a new GLWE
      host_extract<Torus>(streams[0], gpu_indexes[0], extracted_glwe,
                          d_packed_glwe_in, glwe_index, h_mem_ptr);
      glwe_vec.push_back(std::make_pair(i, extracted_glwe));
    } else {
      // Updates the index
      glwe_vec.back().first++;
    }
  }
  // Sample extract all LWEs
  size_t lwe_accumulator_size = compression_params.small_lwe_dimension + 1;

  auto extracted_lwe = h_mem_ptr->tmp_extracted_lwe;
  // Position in h_indexes_array of the first LWE of the current GLWE
  uint32_t current_idx = 0;
  auto d_indexes_array_chunk = d_indexes_array;
  // NOTE(review): the indexes handed to the sample extraction are the raw
  // values of h_indexes_array; for GLWEs other than the first one they may
  // need to be taken modulo polynomial_size - confirm against
  // cuda_glwe_sample_extract_64.
  for (const auto &max_idx_and_glwe : glwe_vec) {
    uint32_t last_idx = max_idx_and_glwe.first;
    extracted_glwe = max_idx_and_glwe.second;

    // last_idx is inclusive
    auto num_lwes = last_idx + 1 - current_idx;
    cuda_glwe_sample_extract_64(streams[0], gpu_indexes[0], extracted_lwe,
                                extracted_glwe, d_indexes_array_chunk, num_lwes,
                                compression_params.glwe_dimension,
                                compression_params.polynomial_size);
    d_indexes_array_chunk += num_lwes;
    // num_lwes LWEs were just written: skip all of them, not only one
    extracted_lwe += num_lwes * lwe_accumulator_size;
    // The next GLWE starts right after the last index of this one
    current_idx = last_idx + 1;
  }

  // Reset
  extracted_lwe = h_mem_ptr->tmp_extracted_lwe;

  // In the case of extracting a single LWE these parameters are dummy
  uint32_t lut_count = 1;
  uint32_t lut_stride = 0;
  /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
  /// dimension to a big LWE dimension
  auto encryption_params = h_mem_ptr->encryption_params;
  auto lut = h_mem_ptr->carry_extract_lut;
  auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
  if (active_gpu_count == 1) {
    execute_pbs_async<Torus>(
        streams, gpu_indexes, active_gpu_count, d_lwe_array_out,
        lut->lwe_indexes_out, lut->lut_vec, lut->lut_indexes_vec, extracted_lwe,
        lut->lwe_indexes_in, d_bsks, lut->buffer,
        encryption_params.glwe_dimension,
        compression_params.small_lwe_dimension,
        encryption_params.polynomial_size, encryption_params.pbs_base_log,
        encryption_params.pbs_level, encryption_params.grouping_factor,
        num_radix_blocks, encryption_params.pbs_type, lut_count, lut_stride,
        false);
  } else {
    /// For multi GPU execution we create vectors of pointers for inputs and
    /// outputs
    std::vector<Torus *> lwe_array_in_vec = lut->lwe_array_in_vec;
    std::vector<Torus *> lwe_after_pbs_vec = lut->lwe_after_pbs_vec;
    std::vector<Torus *> lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec;

    /// Make sure all data that should be on GPU 0 is indeed there
    cuda_synchronize_stream(streams[0], gpu_indexes[0]);

    /// With multiple GPUs we push to the vectors on each GPU then when we
    /// gather data to GPU 0 we can copy back to the original indexing
    multi_gpu_scatter_lwe_async<Torus>(
        streams, gpu_indexes, active_gpu_count, lwe_array_in_vec, extracted_lwe,
        lut->h_lwe_indexes_in, lut->using_trivial_lwe_indexes, num_radix_blocks,
        compression_params.small_lwe_dimension + 1);

    /// Apply PBS
    execute_pbs_async<Torus>(
        streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec,
        lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec,
        lwe_array_in_vec, lwe_trivial_indexes_vec, d_bsks, lut->buffer,
        encryption_params.glwe_dimension,
        compression_params.small_lwe_dimension,
        encryption_params.polynomial_size, encryption_params.pbs_base_log,
        encryption_params.pbs_level, encryption_params.grouping_factor,
        num_radix_blocks, encryption_params.pbs_type, lut_count, lut_stride,
        false);

    /// Copy data back to GPU 0 and release vecs
    multi_gpu_gather_lwe_async<Torus>(
        streams, gpu_indexes, active_gpu_count, d_lwe_array_out,
        lwe_after_pbs_vec, lut->h_lwe_indexes_out,
        lut->using_trivial_lwe_indexes, num_radix_blocks,
        encryption_params.big_lwe_dimension + 1);

    /// Synchronize all GPUs
    for (uint i = 0; i < active_gpu_count; i++) {
      cuda_synchronize_stream(streams[i], gpu_indexes[i]);
    }
  }
}
|
||||
|
||||
/// Creates the state used by the compression and hands it back through
/// `*mem_ptr`.
///
/// Every argument is passed as is to the `int_compression<Torus>`
/// constructor; the object is allocated with `new`.
template <typename Torus>
__host__ void scratch_cuda_compress_integer_radix_ciphertext(
    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    int_compression<Torus> **mem_ptr, uint32_t num_radix_blocks,
    int_radix_params compression_params, uint32_t lwe_per_glwe,
    uint32_t storage_log_modulus, bool allocate_gpu_memory) {

  auto *compression_mem = new int_compression<Torus>(
      streams, gpu_indexes, gpu_count, compression_params, num_radix_blocks,
      lwe_per_glwe, storage_log_modulus, allocate_gpu_memory);

  *mem_ptr = compression_mem;
}
|
||||
|
||||
/// Creates the state used by the decompression and hands it back through
/// `*mem_ptr`.
///
/// Every argument is passed as is to the `int_decompression<Torus>`
/// constructor; the object is allocated with `new`.
template <typename Torus>
__host__ void scratch_cuda_integer_decompress_radix_ciphertext(
    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    int_decompression<Torus> **mem_ptr, uint32_t num_radix_blocks,
    uint32_t body_count, int_radix_params encryption_params,
    int_radix_params compression_params, uint32_t storage_log_modulus,
    bool allocate_gpu_memory) {

  auto *decompression_mem = new int_decompression<Torus>(
      streams, gpu_indexes, gpu_count, encryption_params, compression_params,
      num_radix_blocks, body_count, storage_log_modulus, allocate_gpu_memory);

  *mem_ptr = decompression_mem;
}
|
||||
#endif
|
||||
@@ -14,7 +14,6 @@
|
||||
#include "utils/kernel_dimensions.cuh"
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <omp.h>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
@@ -31,17 +30,13 @@ template <typename Torus> struct lwe_ciphertext_list {
|
||||
int_radix_params params;
|
||||
|
||||
size_t big_lwe_size;
|
||||
size_t radix_size;
|
||||
size_t big_lwe_size_bytes;
|
||||
size_t radix_size_bytes;
|
||||
size_t big_lwe_dimension;
|
||||
|
||||
lwe_ciphertext_list(Torus *src, int_radix_params params, size_t max_blocks)
|
||||
: data(src), params(params), max_blocks(max_blocks) {
|
||||
big_lwe_size = params.big_lwe_dimension + 1;
|
||||
big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
|
||||
radix_size = max_blocks * big_lwe_size;
|
||||
radix_size_bytes = radix_size * sizeof(Torus);
|
||||
big_lwe_dimension = params.big_lwe_dimension;
|
||||
len = max_blocks;
|
||||
}
|
||||
@@ -287,7 +282,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
// Shift the mask so that we will only keep bits we should
|
||||
uint32_t shifted_mask = full_message_mask >> shift_amount;
|
||||
|
||||
integer_radix_apply_univariate_lookup_table_kb(
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, interesting_divisor.last_block(),
|
||||
interesting_divisor.last_block(), bsks, ksks, 1,
|
||||
mem_ptr->masking_luts_1[shifted_mask]);
|
||||
@@ -315,7 +310,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
// the estimated degree of the output is < msg_modulus
|
||||
shifted_mask = shifted_mask & full_message_mask;
|
||||
|
||||
integer_radix_apply_univariate_lookup_table_kb(
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, divisor_ms_blocks.first_block(),
|
||||
divisor_ms_blocks.first_block(), bsks, ksks, 1,
|
||||
mem_ptr->masking_luts_2[shifted_mask]);
|
||||
@@ -339,7 +334,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
interesting_remainder1.insert(0, numerator_block_1.first_block(),
|
||||
streams[0], gpu_indexes[0]);
|
||||
|
||||
host_integer_radix_logical_scalar_shift_kb_inplace(
|
||||
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
|
||||
streams, gpu_indexes, gpu_count, interesting_remainder1.data, 1,
|
||||
mem_ptr->shift_mem_1, bsks, ksks, interesting_remainder1.len);
|
||||
|
||||
@@ -347,7 +342,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
interesting_remainder1.len - 1, streams[0],
|
||||
gpu_indexes[0]);
|
||||
|
||||
host_radix_blocks_rotate_left(
|
||||
host_radix_blocks_rotate_left<Torus>(
|
||||
streams, gpu_indexes, gpu_count, interesting_remainder1.data,
|
||||
tmp_radix.data, 1, interesting_remainder1.len, big_lwe_size);
|
||||
|
||||
@@ -368,7 +363,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
|
||||
auto left_shift_interesting_remainder2 =
|
||||
[&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) {
|
||||
host_integer_radix_logical_scalar_shift_kb_inplace(
|
||||
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
|
||||
streams, gpu_indexes, gpu_count, interesting_remainder2.data, 1,
|
||||
mem_ptr->shift_mem_2, bsks, ksks, interesting_remainder2.len);
|
||||
}; // left_shift_interesting_remainder2
|
||||
@@ -376,35 +371,19 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
for (uint j = 0; j < gpu_count; j++) {
|
||||
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
|
||||
}
|
||||
#pragma omp parallel sections
|
||||
{
|
||||
#pragma omp section
|
||||
{
|
||||
// interesting_divisor
|
||||
trim_last_interesting_divisor_bits(mem_ptr->sub_streams_1, gpu_indexes,
|
||||
gpu_count);
|
||||
}
|
||||
#pragma omp section
|
||||
{
|
||||
// divisor_ms_blocks
|
||||
trim_first_divisor_ms_bits(mem_ptr->sub_streams_2, gpu_indexes,
|
||||
gpu_count);
|
||||
}
|
||||
#pragma omp section
|
||||
{
|
||||
// interesting_remainder1
|
||||
// numerator_block_stack
|
||||
left_shift_interesting_remainder1(mem_ptr->sub_streams_3, gpu_indexes,
|
||||
gpu_count);
|
||||
}
|
||||
#pragma omp section
|
||||
{
|
||||
// interesting_remainder2
|
||||
left_shift_interesting_remainder2(mem_ptr->sub_streams_4, gpu_indexes,
|
||||
gpu_count);
|
||||
}
|
||||
}
|
||||
for (uint j = 0; j < gpu_count; j++) {
|
||||
// interesting_divisor
|
||||
trim_last_interesting_divisor_bits(mem_ptr->sub_streams_1, gpu_indexes,
|
||||
gpu_count);
|
||||
// divisor_ms_blocks
|
||||
trim_first_divisor_ms_bits(mem_ptr->sub_streams_2, gpu_indexes, gpu_count);
|
||||
// interesting_remainder1
|
||||
// numerator_block_stack
|
||||
left_shift_interesting_remainder1(mem_ptr->sub_streams_3, gpu_indexes,
|
||||
gpu_count);
|
||||
// interesting_remainder2
|
||||
left_shift_interesting_remainder2(mem_ptr->sub_streams_4, gpu_indexes,
|
||||
gpu_count);
|
||||
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
|
||||
cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
|
||||
cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
|
||||
cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]);
|
||||
@@ -417,10 +396,10 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
// but in that position, interesting_remainder2 always has a 0
|
||||
auto &merged_interesting_remainder = interesting_remainder1;
|
||||
|
||||
host_addition(streams[0], gpu_indexes[0], merged_interesting_remainder.data,
|
||||
merged_interesting_remainder.data,
|
||||
interesting_remainder2.data, radix_params.big_lwe_dimension,
|
||||
merged_interesting_remainder.len);
|
||||
host_addition<Torus>(
|
||||
streams[0], gpu_indexes[0], merged_interesting_remainder.data,
|
||||
merged_interesting_remainder.data, interesting_remainder2.data,
|
||||
radix_params.big_lwe_dimension, merged_interesting_remainder.len);
|
||||
|
||||
// after create_clean_version_of_merged_remainder
|
||||
// `merged_interesting_remainder` will be reused as
|
||||
@@ -460,7 +439,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
// We could call unchecked_scalar_ne
|
||||
// But we are in the special case where scalar == 0
|
||||
// So we can skip some stuff
|
||||
host_compare_with_zero_equality(
|
||||
host_compare_with_zero_equality<Torus>(
|
||||
streams, gpu_indexes, gpu_count, tmp_1.data, trivial_blocks.data,
|
||||
mem_ptr->comparison_buffer, bsks, ksks, trivial_blocks.len,
|
||||
mem_ptr->comparison_buffer->eq_buffer->is_non_zero_lut);
|
||||
@@ -468,7 +447,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
tmp_1.len =
|
||||
ceil_div(trivial_blocks.len, message_modulus * carry_modulus - 1);
|
||||
|
||||
is_at_least_one_comparisons_block_true(
|
||||
is_at_least_one_comparisons_block_true<Torus>(
|
||||
streams, gpu_indexes, gpu_count,
|
||||
at_least_one_upper_block_is_non_zero.data, tmp_1.data,
|
||||
mem_ptr->comparison_buffer, bsks, ksks, tmp_1.len);
|
||||
@@ -481,7 +460,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
// `cleaned_merged_interesting_remainder` - radix ciphertext
|
||||
auto create_clean_version_of_merged_remainder =
|
||||
[&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) {
|
||||
integer_radix_apply_univariate_lookup_table_kb(
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count,
|
||||
cleaned_merged_interesting_remainder.data,
|
||||
cleaned_merged_interesting_remainder.data, bsks, ksks,
|
||||
@@ -493,37 +472,24 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
for (uint j = 0; j < gpu_count; j++) {
|
||||
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
|
||||
}
|
||||
#pragma omp parallel sections
|
||||
{
|
||||
#pragma omp section
|
||||
{
|
||||
// new_remainder
|
||||
// subtraction_overflowed
|
||||
do_overflowing_sub(mem_ptr->sub_streams_1, gpu_indexes, gpu_count);
|
||||
}
|
||||
#pragma omp section
|
||||
{
|
||||
// at_least_one_upper_block_is_non_zero
|
||||
check_divisor_upper_blocks(mem_ptr->sub_streams_2, gpu_indexes,
|
||||
gpu_count);
|
||||
}
|
||||
#pragma omp section
|
||||
{
|
||||
// cleaned_merged_interesting_remainder
|
||||
create_clean_version_of_merged_remainder(mem_ptr->sub_streams_3,
|
||||
gpu_indexes, gpu_count);
|
||||
}
|
||||
}
|
||||
for (uint j = 0; j < gpu_count; j++) {
|
||||
// new_remainder
|
||||
// subtraction_overflowed
|
||||
do_overflowing_sub(mem_ptr->sub_streams_1, gpu_indexes, gpu_count);
|
||||
// at_least_one_upper_block_is_non_zero
|
||||
check_divisor_upper_blocks(mem_ptr->sub_streams_2, gpu_indexes, gpu_count);
|
||||
// cleaned_merged_interesting_remainder
|
||||
create_clean_version_of_merged_remainder(mem_ptr->sub_streams_3,
|
||||
gpu_indexes, gpu_count);
|
||||
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
|
||||
cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
|
||||
cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
|
||||
cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]);
|
||||
}
|
||||
|
||||
host_addition(streams[0], gpu_indexes[0], overflow_sum.data,
|
||||
subtraction_overflowed.data,
|
||||
at_least_one_upper_block_is_non_zero.data,
|
||||
radix_params.big_lwe_dimension, 1);
|
||||
host_addition<Torus>(streams[0], gpu_indexes[0], overflow_sum.data,
|
||||
subtraction_overflowed.data,
|
||||
at_least_one_upper_block_is_non_zero.data,
|
||||
radix_params.big_lwe_dimension, 1);
|
||||
|
||||
int factor = (i) ? 3 : 2;
|
||||
int factor_lut_id = factor - 2;
|
||||
@@ -562,36 +528,24 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
mem_ptr->merge_overflow_flags_luts[pos_in_block]
|
||||
->params.message_modulus);
|
||||
|
||||
host_addition(streams[0], gpu_indexes[0],
|
||||
"ient[block_of_bit * big_lwe_size],
|
||||
"ient[block_of_bit * big_lwe_size],
|
||||
did_not_overflow.data, radix_params.big_lwe_dimension, 1);
|
||||
host_addition<Torus>(
|
||||
streams[0], gpu_indexes[0], "ient[block_of_bit * big_lwe_size],
|
||||
"ient[block_of_bit * big_lwe_size], did_not_overflow.data,
|
||||
radix_params.big_lwe_dimension, 1);
|
||||
};
|
||||
|
||||
for (uint j = 0; j < gpu_count; j++) {
|
||||
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
|
||||
}
|
||||
#pragma omp parallel sections
|
||||
{
|
||||
#pragma omp section
|
||||
{
|
||||
// cleaned_merged_interesting_remainder
|
||||
conditionally_zero_out_merged_interesting_remainder(
|
||||
mem_ptr->sub_streams_1, gpu_indexes, gpu_count);
|
||||
}
|
||||
#pragma omp section
|
||||
{
|
||||
// new_remainder
|
||||
conditionally_zero_out_merged_new_remainder(mem_ptr->sub_streams_2,
|
||||
gpu_indexes, gpu_count);
|
||||
}
|
||||
#pragma omp section
|
||||
{
|
||||
// quotient
|
||||
set_quotient_bit(mem_ptr->sub_streams_3, gpu_indexes, gpu_count);
|
||||
}
|
||||
}
|
||||
for (uint j = 0; j < gpu_count; j++) {
|
||||
// cleaned_merged_interesting_remainder
|
||||
conditionally_zero_out_merged_interesting_remainder(mem_ptr->sub_streams_1,
|
||||
gpu_indexes, gpu_count);
|
||||
// new_remainder
|
||||
conditionally_zero_out_merged_new_remainder(mem_ptr->sub_streams_2,
|
||||
gpu_indexes, gpu_count);
|
||||
// quotient
|
||||
set_quotient_bit(mem_ptr->sub_streams_3, gpu_indexes, gpu_count);
|
||||
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
|
||||
cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
|
||||
cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
|
||||
cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]);
|
||||
@@ -610,29 +564,20 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
|
||||
// Clean the quotient and remainder
|
||||
// as even though they have no carries, they are not at nominal noise level
|
||||
host_addition(streams[0], gpu_indexes[0], remainder, remainder1.data,
|
||||
remainder2.data, radix_params.big_lwe_dimension,
|
||||
remainder1.len);
|
||||
host_addition<Torus>(streams[0], gpu_indexes[0], remainder, remainder1.data,
|
||||
remainder2.data, radix_params.big_lwe_dimension,
|
||||
remainder1.len);
|
||||
|
||||
for (uint j = 0; j < gpu_count; j++) {
|
||||
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
|
||||
}
|
||||
#pragma omp parallel sections
|
||||
{
|
||||
#pragma omp section
|
||||
{
|
||||
integer_radix_apply_univariate_lookup_table_kb(
|
||||
mem_ptr->sub_streams_1, gpu_indexes, gpu_count, remainder, remainder,
|
||||
bsks, ksks, num_blocks, mem_ptr->message_extract_lut_1);
|
||||
}
|
||||
#pragma omp section
|
||||
{
|
||||
integer_radix_apply_univariate_lookup_table_kb(
|
||||
mem_ptr->sub_streams_2, gpu_indexes, gpu_count, quotient, quotient,
|
||||
bsks, ksks, num_blocks, mem_ptr->message_extract_lut_2);
|
||||
}
|
||||
}
|
||||
for (uint j = 0; j < gpu_count; j++) {
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
mem_ptr->sub_streams_1, gpu_indexes, gpu_count, remainder, remainder,
|
||||
bsks, ksks, num_blocks, mem_ptr->message_extract_lut_1);
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
mem_ptr->sub_streams_2, gpu_indexes, gpu_count, quotient, quotient, bsks,
|
||||
ksks, num_blocks, mem_ptr->message_extract_lut_2);
|
||||
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
|
||||
cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
|
||||
cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
|
||||
}
|
||||
|
||||
@@ -19,9 +19,8 @@ void scratch_cuda_full_propagation_64(
|
||||
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
|
||||
uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t num_radix_blocks,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
bool allocate_gpu_memory) {
|
||||
uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory) {
|
||||
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
|
||||
glwe_dimension * polynomial_size, lwe_dimension,
|
||||
ks_level, ks_base_log, pbs_level, pbs_base_log,
|
||||
@@ -29,8 +28,7 @@ void scratch_cuda_full_propagation_64(
|
||||
|
||||
scratch_cuda_full_propagation<uint64_t>(
|
||||
(cudaStream_t *)streams, gpu_indexes, gpu_count,
|
||||
(int_fullprop_buffer<uint64_t> **)mem_ptr, params, num_radix_blocks,
|
||||
allocate_gpu_memory);
|
||||
(int_fullprop_buffer<uint64_t> **)mem_ptr, params, allocate_gpu_memory);
|
||||
}
|
||||
|
||||
void cleanup_cuda_full_propagation(void **streams, uint32_t *gpu_indexes,
|
||||
@@ -55,7 +53,7 @@ void scratch_cuda_propagate_single_carry_kb_64_inplace(
|
||||
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
|
||||
message_modulus, carry_modulus);
|
||||
|
||||
scratch_cuda_propagate_single_carry_kb_inplace(
|
||||
scratch_cuda_propagate_single_carry_kb_inplace<uint64_t>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
(int_sc_prop_memory<uint64_t> **)mem_ptr, num_blocks, params,
|
||||
allocate_gpu_memory);
|
||||
@@ -133,6 +131,19 @@ void cleanup_cuda_apply_univariate_lut_kb_64(void **streams,
|
||||
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
|
||||
}
|
||||
|
||||
// C entry point for the 64-bit torus "many univariate LUT" operation.
//
// The opaque arguments are reinterpreted as their typed counterparts and the
// call is forwarded, unchanged otherwise, to
// host_apply_many_univariate_lut_kb<uint64_t>:
//   - streams          -> cudaStream_t *
//   - output_radix_lwe -> uint64_t * (output ciphertext blocks)
//   - input_radix_lwe  -> uint64_t * (input ciphertext blocks)
//   - mem_ptr          -> int_radix_lut<uint64_t> *
//   - ksks             -> uint64_t **
// `bsks`, `num_blocks`, `lut_count` and `lut_stride` are passed through as-is.
void cuda_apply_many_univariate_lut_kb_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    void *output_radix_lwe, void *input_radix_lwe, int8_t *mem_ptr, void **ksks,
    void **bsks, uint32_t num_blocks, uint32_t lut_count, uint32_t lut_stride) {

  auto cuda_streams = (cudaStream_t *)(streams);
  auto lwe_out = static_cast<uint64_t *>(output_radix_lwe);
  auto lwe_in = static_cast<uint64_t *>(input_radix_lwe);
  auto lut = (int_radix_lut<uint64_t> *)mem_ptr;
  auto keyswitch_keys = (uint64_t **)(ksks);

  host_apply_many_univariate_lut_kb<uint64_t>(
      cuda_streams, gpu_indexes, gpu_count, lwe_out, lwe_in, lut,
      keyswitch_keys, bsks, num_blocks, lut_count, lut_stride);
}
|
||||
|
||||
void scratch_cuda_apply_bivariate_lut_kb_64(
|
||||
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
|
||||
void *input_lut, uint32_t lwe_dimension, uint32_t glwe_dimension,
|
||||
@@ -175,3 +186,55 @@ void cleanup_cuda_apply_bivariate_lut_kb_64(void **streams,
|
||||
int_radix_lut<uint64_t> *mem_ptr = (int_radix_lut<uint64_t> *)(*mem_ptr_void);
|
||||
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
|
||||
}
|
||||
|
||||
// Scratch (allocation) entry point for the 64-bit Hillis-Steele prefix sum.
//
// Builds an int_radix_params from the scalar crypto parameters (the big LWE
// dimension is passed as glwe_dimension * polynomial_size) and delegates to
// scratch_cuda_apply_bivariate_lut_kb<uint64_t>, which receives:
//   - mem_ptr reinterpreted as int_radix_lut<uint64_t> **
//   - input_lut reinterpreted as uint64_t *
//   - num_radix_blocks, the params object and allocate_gpu_memory unchanged.
void scratch_cuda_integer_compute_prefix_sum_hillis_steele_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
    void *input_lut, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
    PBS_TYPE pbs_type, bool allocate_gpu_memory) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          glwe_dimension * polynomial_size, lwe_dimension,
                          ks_level, ks_base_log, pbs_level, pbs_base_log,
                          grouping_factor, message_modulus, carry_modulus);

  auto cuda_streams = (cudaStream_t *)(streams);
  auto lut_handle = (int_radix_lut<uint64_t> **)mem_ptr;
  auto lut_values = static_cast<uint64_t *>(input_lut);

  scratch_cuda_apply_bivariate_lut_kb<uint64_t>(
      cuda_streams, gpu_indexes, gpu_count, lut_handle, lut_values,
      num_radix_blocks, params, allocate_gpu_memory);
}
|
||||
|
||||
// C entry point for the 64-bit Hillis-Steele prefix sum.
//
// `mem_ptr` is reinterpreted as an int_radix_lut<uint64_t> *; its `params`
// member is copied and handed, together with the LUT itself, to
// host_compute_prefix_sum_hillis_steele<uint64_t>. The ciphertext pointers are
// reinterpreted as uint64_t * and `ksks` as uint64_t **.
//
// NOTE: `shift` is part of the signature but is not read by this wrapper.
void cuda_integer_compute_prefix_sum_hillis_steele_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    void *output_radix_lwe, void *generates_or_propagates, int8_t *mem_ptr,
    void **ksks, void **bsks, uint32_t num_blocks, uint32_t shift) {

  auto lut = (int_radix_lut<uint64_t> *)mem_ptr;
  int_radix_params params = lut->params;

  auto cuda_streams = (cudaStream_t *)(streams);
  auto lwe_out = static_cast<uint64_t *>(output_radix_lwe);
  auto lwe_in_out = static_cast<uint64_t *>(generates_or_propagates);
  auto keyswitch_keys = (uint64_t **)(ksks);

  host_compute_prefix_sum_hillis_steele<uint64_t>(
      cuda_streams, gpu_indexes, gpu_count, lwe_out, lwe_in_out, params, lut,
      bsks, keyswitch_keys, num_blocks);
}
|
||||
|
||||
// Cleanup entry point matching the Hillis-Steele prefix-sum scratch function.
//
// Dereferences `mem_ptr_void`, reinterprets the result as an
// int_radix_lut<uint64_t> * and calls its release() method with the given
// streams (reinterpreted as cudaStream_t *), GPU indexes and GPU count.
void cleanup_cuda_integer_compute_prefix_sum_hillis_steele_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr_void) {
  auto cuda_streams = (cudaStream_t *)(streams);
  auto lut = (int_radix_lut<uint64_t> *)(*mem_ptr_void);
  lut->release(cuda_streams, gpu_indexes, gpu_count);
}
|
||||
|
||||
// C entry point that reverses the blocks of a 64-bit radix ciphertext in
// place.
//
// `streams` is reinterpreted as cudaStream_t * and `lwe_array` as uint64_t *;
// everything is then forwarded to host_radix_blocks_reverse_inplace<uint64_t>.
// `gpu_count` is accepted for interface uniformity but is not forwarded.
void cuda_integer_reverse_blocks_64_inplace(void **streams,
                                            uint32_t *gpu_indexes,
                                            uint32_t gpu_count, void *lwe_array,
                                            uint32_t num_blocks,
                                            uint32_t lwe_size) {

  auto cuda_streams = (cudaStream_t *)(streams);
  auto blocks = static_cast<uint64_t *>(lwe_array);

  host_radix_blocks_reverse_inplace<uint64_t>(cuda_streams, gpu_indexes, blocks,
                                              num_blocks, lwe_size);
}
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
|
||||
#include "crypto/keyswitch.cuh"
|
||||
#include "device.h"
|
||||
#include "helper_multi_gpu.h"
|
||||
#include "integer.h"
|
||||
#include "integer/scalar_addition.cuh"
|
||||
#include "linear_algebra.h"
|
||||
@@ -10,6 +11,7 @@
|
||||
#include "polynomial/functions.cuh"
|
||||
#include "programmable_bootstrap.h"
|
||||
#include "utils/helper.cuh"
|
||||
#include "utils/helper_multi_gpu.cuh"
|
||||
#include "utils/kernel_dimensions.cuh"
|
||||
#include <functional>
|
||||
|
||||
@@ -20,18 +22,19 @@ template <typename Torus>
|
||||
__global__ void radix_blocks_rotate_right(Torus *dst, Torus *src,
|
||||
uint32_t value, uint32_t blocks_count,
|
||||
uint32_t lwe_size) {
|
||||
value %= blocks_count;
|
||||
|
||||
size_t tid = threadIdx.x;
|
||||
size_t src_block_id = blockIdx.x;
|
||||
size_t dst_block_id = (src_block_id + value) % blocks_count;
|
||||
size_t stride = blockDim.x;
|
||||
if (tid < lwe_size) {
|
||||
value %= blocks_count;
|
||||
size_t src_block_id = blockIdx.x;
|
||||
size_t dst_block_id = (src_block_id + value) % blocks_count;
|
||||
size_t stride = blockDim.x;
|
||||
|
||||
auto cur_src_block = &src[src_block_id * lwe_size];
|
||||
auto cur_dst_block = &dst[dst_block_id * lwe_size];
|
||||
auto cur_src_block = &src[src_block_id * lwe_size];
|
||||
auto cur_dst_block = &dst[dst_block_id * lwe_size];
|
||||
|
||||
for (size_t i = tid; i < lwe_size; i += stride) {
|
||||
cur_dst_block[i] = cur_src_block[i];
|
||||
for (size_t i = tid; i < lwe_size; i += stride) {
|
||||
cur_dst_block[i] = cur_src_block[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -42,25 +45,28 @@ template <typename Torus>
|
||||
__global__ void radix_blocks_rotate_left(Torus *dst, Torus *src, uint32_t value,
|
||||
uint32_t blocks_count,
|
||||
uint32_t lwe_size) {
|
||||
value %= blocks_count;
|
||||
size_t src_block_id = blockIdx.x;
|
||||
|
||||
size_t tid = threadIdx.x;
|
||||
size_t dst_block_id = (src_block_id >= value)
|
||||
? src_block_id - value
|
||||
: src_block_id - value + blocks_count;
|
||||
size_t stride = blockDim.x;
|
||||
if (tid < lwe_size) {
|
||||
value %= blocks_count;
|
||||
size_t src_block_id = blockIdx.x;
|
||||
|
||||
auto cur_src_block = &src[src_block_id * lwe_size];
|
||||
auto cur_dst_block = &dst[dst_block_id * lwe_size];
|
||||
size_t dst_block_id = (src_block_id >= value)
|
||||
? src_block_id - value
|
||||
: src_block_id - value + blocks_count;
|
||||
size_t stride = blockDim.x;
|
||||
|
||||
for (size_t i = tid; i < lwe_size; i += stride) {
|
||||
cur_dst_block[i] = cur_src_block[i];
|
||||
auto cur_src_block = &src[src_block_id * lwe_size];
|
||||
auto cur_dst_block = &dst[dst_block_id * lwe_size];
|
||||
|
||||
for (size_t i = tid; i < lwe_size; i += stride) {
|
||||
cur_dst_block[i] = cur_src_block[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// rotate radix ciphertext right with specific value
|
||||
// calculation is not inplace, so `dst` and `src` must not be the same
|
||||
// one block is responsible to process single lwe ciphertext
|
||||
template <typename Torus>
|
||||
__host__ void
|
||||
host_radix_blocks_rotate_right(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
@@ -72,7 +78,7 @@ host_radix_blocks_rotate_right(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
"pointers should be different");
|
||||
}
|
||||
cudaSetDevice(gpu_indexes[0]);
|
||||
radix_blocks_rotate_right<<<blocks_count, 1024, 0, streams[0]>>>(
|
||||
radix_blocks_rotate_right<Torus><<<blocks_count, 1024, 0, streams[0]>>>(
|
||||
dst, src, value, blocks_count, lwe_size);
|
||||
}
|
||||
|
||||
@@ -89,10 +95,39 @@ host_radix_blocks_rotate_left(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
"pointers should be different");
|
||||
}
|
||||
cudaSetDevice(gpu_indexes[0]);
|
||||
radix_blocks_rotate_left<<<blocks_count, 1024, 0, streams[0]>>>(
|
||||
radix_blocks_rotate_left<Torus><<<blocks_count, 1024, 0, streams[0]>>>(
|
||||
dst, src, value, blocks_count, lwe_size);
|
||||
}
|
||||
|
||||
// Reverses the order of the LWE blocks stored in `src`, in place.
//
// Each CUDA block handles one pair: LWE block `blockIdx.x` is exchanged with
// LWE block `blocks_count - 1 - blockIdx.x`. The threads of the CUDA block
// walk the `lwe_size` elements of the pair with a stride of blockDim.x.
template <typename Torus>
__global__ void radix_blocks_reverse_lwe_inplace(Torus *src,
                                                 uint32_t blocks_count,
                                                 uint32_t lwe_size) {

  size_t front_id = blockIdx.x;
  size_t back_id = blocks_count - 1 - front_id;

  // Offsets of the first element of each LWE block of the pair
  size_t front_offset = front_id * lwe_size;
  size_t back_offset = back_id * lwe_size;

  for (int j = threadIdx.x; j < lwe_size; j += blockDim.x) {
    Torus from_back = src[back_offset + j];
    Torus from_front = src[front_offset + j];
    src[front_offset + j] = from_back;
    src[back_offset + j] = from_front;
  }
}
|
||||
|
||||
// Reverses, in place, the order of the `blocks_count` LWE blocks held in
// `src`, each block being `lwe_size` Torus elements long.
//
// The kernel is enqueued on streams[0] after selecting GPU gpu_indexes[0];
// this function does not synchronize the stream. One CUDA block of 1024
// threads is launched per pair of LWE blocks to swap (blocks_count / 2 CUDA
// blocks in total).
template <typename Torus>
__host__ void
host_radix_blocks_reverse_inplace(cudaStream_t *streams, uint32_t *gpu_indexes,
                                  Torus *src, uint32_t blocks_count,
                                  uint32_t lwe_size) {
  cudaSetDevice(gpu_indexes[0]);

  // With fewer than two LWE blocks there is no pair to swap. A grid of zero
  // CUDA blocks is an invalid launch configuration, so skip the launch.
  int num_blocks = blocks_count / 2, num_threads = 1024;
  if (num_blocks == 0)
    return;

  radix_blocks_reverse_lwe_inplace<Torus>
      <<<num_blocks, num_threads, 0, streams[0]>>>(src, blocks_count, lwe_size);
  // Report launch errors at the launch site, as the other launchers in this
  // file do.
  check_cuda_error(cudaGetLastError());
}
|
||||
|
||||
// polynomial_size threads
|
||||
template <typename Torus>
|
||||
__global__ void
|
||||
@@ -129,9 +164,10 @@ __host__ void pack_bivariate_blocks(cudaStream_t *streams,
|
||||
int num_blocks = 0, num_threads = 0;
|
||||
int num_entries = num_radix_blocks * (lwe_dimension + 1);
|
||||
getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
|
||||
device_pack_bivariate_blocks<<<num_blocks, num_threads, 0, streams[0]>>>(
|
||||
lwe_array_out, lwe_indexes_out, lwe_array_1, lwe_array_2, lwe_indexes_in,
|
||||
lwe_dimension, shift, num_radix_blocks);
|
||||
device_pack_bivariate_blocks<Torus>
|
||||
<<<num_blocks, num_threads, 0, streams[0]>>>(
|
||||
lwe_array_out, lwe_indexes_out, lwe_array_1, lwe_array_2,
|
||||
lwe_indexes_in, lwe_dimension, shift, num_radix_blocks);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
@@ -153,28 +189,157 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
|
||||
auto polynomial_size = params.polynomial_size;
|
||||
auto grouping_factor = params.grouping_factor;
|
||||
|
||||
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
|
||||
/// Apply KS to go from a big LWE dimension to a small LWE dimension
|
||||
execute_keyswitch<Torus>(streams, gpu_indexes, gpu_count,
|
||||
lut->tmp_lwe_after_ks, lut->lwe_trivial_indexes,
|
||||
lwe_array_in, lut->lwe_indexes_in, ksks,
|
||||
big_lwe_dimension, small_lwe_dimension, ks_base_log,
|
||||
ks_level, num_radix_blocks, false);
|
||||
// In the case of extracting a single LWE these parameters are dummy
|
||||
uint32_t lut_count = 1;
|
||||
uint32_t lut_stride = 0;
|
||||
/// For multi GPU execution we create vectors of pointers for inputs and
|
||||
/// outputs
|
||||
std::vector<Torus *> lwe_array_in_vec = lut->lwe_array_in_vec;
|
||||
std::vector<Torus *> lwe_after_ks_vec = lut->lwe_after_ks_vec;
|
||||
std::vector<Torus *> lwe_after_pbs_vec = lut->lwe_after_pbs_vec;
|
||||
std::vector<Torus *> lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec;
|
||||
|
||||
/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
|
||||
/// dimension to a big LWE dimension
|
||||
execute_pbs<Torus>(
|
||||
streams, gpu_indexes, gpu_count, lwe_array_out, lut->lwe_indexes_out,
|
||||
lut->lut_vec, lut->lut_indexes_vec, lut->tmp_lwe_after_ks,
|
||||
lut->lwe_trivial_indexes, bsks, lut->buffer, glwe_dimension,
|
||||
small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
|
||||
grouping_factor, num_radix_blocks, 1, 0,
|
||||
cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type, false);
|
||||
|
||||
/// Synchronize all GPUs
|
||||
auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
|
||||
for (uint i = 0; i < active_gpu_count; i++) {
|
||||
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
|
||||
if (active_gpu_count == 1) {
|
||||
execute_keyswitch_async<Torus>(streams, gpu_indexes, 1, lwe_after_ks_vec[0],
|
||||
lwe_trivial_indexes_vec[0], lwe_array_in,
|
||||
lut->lwe_indexes_in, ksks, big_lwe_dimension,
|
||||
small_lwe_dimension, ks_base_log, ks_level,
|
||||
num_radix_blocks);
|
||||
|
||||
/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
|
||||
/// dimension to a big LWE dimension
|
||||
execute_pbs_async<Torus>(
|
||||
streams, gpu_indexes, 1, lwe_array_out, lut->lwe_indexes_out,
|
||||
lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec[0],
|
||||
lwe_trivial_indexes_vec[0], bsks, lut->buffer, glwe_dimension,
|
||||
small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
|
||||
grouping_factor, num_radix_blocks, pbs_type, lut_count, lut_stride,
|
||||
true);
|
||||
} else {
|
||||
/// Make sure all data that should be on GPU 0 is indeed there
|
||||
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
|
||||
|
||||
/// With multiple GPUs we push to the vectors on each GPU then when we
|
||||
/// gather data to GPU 0 we can copy back to the original indexing
|
||||
multi_gpu_scatter_lwe_async<Torus>(
|
||||
streams, gpu_indexes, active_gpu_count, lwe_array_in_vec, lwe_array_in,
|
||||
lut->h_lwe_indexes_in, lut->using_trivial_lwe_indexes, num_radix_blocks,
|
||||
big_lwe_dimension + 1);
|
||||
|
||||
/// Apply KS to go from a big LWE dimension to a small LWE dimension
|
||||
execute_keyswitch_async<Torus>(streams, gpu_indexes, active_gpu_count,
|
||||
lwe_after_ks_vec, lwe_trivial_indexes_vec,
|
||||
lwe_array_in_vec, lwe_trivial_indexes_vec,
|
||||
ksks, big_lwe_dimension, small_lwe_dimension,
|
||||
ks_base_log, ks_level, num_radix_blocks);
|
||||
|
||||
/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
|
||||
/// dimension to a big LWE dimension
|
||||
execute_pbs_async<Torus>(
|
||||
streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec,
|
||||
lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec,
|
||||
lwe_after_ks_vec, lwe_trivial_indexes_vec, bsks, lut->buffer,
|
||||
glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log,
|
||||
pbs_level, grouping_factor, num_radix_blocks, pbs_type, lut_count,
|
||||
lut_stride, true);
|
||||
|
||||
/// Copy data back to GPU 0 and release vecs
|
||||
multi_gpu_gather_lwe_async<Torus>(streams, gpu_indexes, active_gpu_count,
|
||||
lwe_array_out, lwe_after_pbs_vec,
|
||||
lut->h_lwe_indexes_out,
|
||||
lut->using_trivial_lwe_indexes,
|
||||
num_radix_blocks, big_lwe_dimension + 1);
|
||||
|
||||
/// Synchronize all GPUs
|
||||
for (uint i = 0; i < active_gpu_count; i++) {
|
||||
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Applies the lookup table(s) held by `lut` to `num_radix_blocks` blocks of
// `lwe_array_in` and writes the result to `lwe_array_out`: a keyswitch
// followed by a PBS, both enqueued asynchronously.
//
// `lut_count` and `lut_stride` are forwarded untouched to execute_pbs_async.
//
// When a single GPU is active the keyswitch and PBS are queued directly on
// the caller's arrays and the function returns without synchronizing. With
// several active GPUs the input is scattered to per-GPU buffers owned by
// `lut`, processed there, gathered back into `lwe_array_out`, and every
// active stream is synchronized before returning.
template <typename Torus>
__host__ void integer_radix_apply_many_univariate_lookup_table_kb(
    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    Torus *lwe_array_out, Torus *lwe_array_in, void **bsks, Torus **ksks,
    uint32_t num_radix_blocks, int_radix_lut<Torus> *lut, uint32_t lut_count,
    uint32_t lut_stride) {
  // Unpack the crypto parameters attached to the lookup table
  auto radix_params = lut->params;
  auto pbs_type = radix_params.pbs_type;
  auto big_lwe_dimension = radix_params.big_lwe_dimension;
  auto small_lwe_dimension = radix_params.small_lwe_dimension;
  auto ks_level = radix_params.ks_level;
  auto ks_base_log = radix_params.ks_base_log;
  auto pbs_level = radix_params.pbs_level;
  auto pbs_base_log = radix_params.pbs_base_log;
  auto glwe_dimension = radix_params.glwe_dimension;
  auto polynomial_size = radix_params.polynomial_size;
  auto grouping_factor = radix_params.grouping_factor;

  /// Per-GPU pointer vectors used for inputs, intermediate results and
  /// outputs
  std::vector<Torus *> in_vec = lut->lwe_array_in_vec;
  std::vector<Torus *> after_ks_vec = lut->lwe_after_ks_vec;
  std::vector<Torus *> after_pbs_vec = lut->lwe_after_pbs_vec;
  std::vector<Torus *> trivial_indexes_vec = lut->lwe_trivial_indexes_vec;

  auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);

  if (active_gpu_count == 1) {
    /// Single GPU: keyswitch from the big to the small LWE dimension, reading
    /// the caller's input directly
    execute_keyswitch_async<Torus>(
        streams, gpu_indexes, 1, after_ks_vec[0], trivial_indexes_vec[0],
        lwe_array_in, lut->lwe_indexes_in, ksks, big_lwe_dimension,
        small_lwe_dimension, ks_base_log, ks_level, num_radix_blocks);

    /// PBS: apply the LUT, reduce the noise and go back from the small to the
    /// big LWE dimension, writing straight into the caller's output
    execute_pbs_async<Torus>(
        streams, gpu_indexes, 1, lwe_array_out, lut->lwe_indexes_out,
        lut->lut_vec, lut->lut_indexes_vec, after_ks_vec[0],
        trivial_indexes_vec[0], bsks, lut->buffer, glwe_dimension,
        small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
        grouping_factor, num_radix_blocks, pbs_type, lut_count, lut_stride,
        true);
    return;
  }

  /// Multi GPU: first make sure all data that should be on GPU 0 is there
  cuda_synchronize_stream(streams[0], gpu_indexes[0]);

  /// Push the input blocks to the per-GPU vectors; the original indexing is
  /// restored when the results are gathered back to GPU 0
  multi_gpu_scatter_lwe_async<Torus>(
      streams, gpu_indexes, active_gpu_count, in_vec, lwe_array_in,
      lut->h_lwe_indexes_in, lut->using_trivial_lwe_indexes, num_radix_blocks,
      big_lwe_dimension + 1);

  /// Keyswitch from the big to the small LWE dimension on every active GPU
  execute_keyswitch_async<Torus>(
      streams, gpu_indexes, active_gpu_count, after_ks_vec, trivial_indexes_vec,
      in_vec, trivial_indexes_vec, ksks, big_lwe_dimension, small_lwe_dimension,
      ks_base_log, ks_level, num_radix_blocks);

  /// PBS: apply the LUT, reduce the noise and go back from the small to the
  /// big LWE dimension
  execute_pbs_async<Torus>(
      streams, gpu_indexes, active_gpu_count, after_pbs_vec,
      trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec, after_ks_vec,
      trivial_indexes_vec, bsks, lut->buffer, glwe_dimension,
      small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
      grouping_factor, num_radix_blocks, pbs_type, lut_count, lut_stride, true);

  /// Copy the results back to GPU 0
  multi_gpu_gather_lwe_async<Torus>(
      streams, gpu_indexes, active_gpu_count, lwe_array_out, after_pbs_vec,
      lut->h_lwe_indexes_out, lut->using_trivial_lwe_indexes, num_radix_blocks,
      big_lwe_dimension + 1);

  /// Synchronize all active GPUs
  for (uint gpu = 0; gpu < active_gpu_count; gpu++) {
    cuda_synchronize_stream(streams[gpu], gpu_indexes[gpu]);
  }
}
|
||||
|
||||
@@ -197,37 +362,77 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
|
||||
auto polynomial_size = params.polynomial_size;
|
||||
auto grouping_factor = params.grouping_factor;
|
||||
|
||||
// In the case of extracting a single LWE these parameters are dummy
|
||||
uint32_t lut_count = 1;
|
||||
uint32_t lut_stride = 0;
|
||||
|
||||
// Left message is shifted
|
||||
auto lwe_array_pbs_in = lut->tmp_lwe_before_ks;
|
||||
pack_bivariate_blocks(streams, gpu_indexes, gpu_count, lwe_array_pbs_in,
|
||||
lut->lwe_trivial_indexes, lwe_array_1, lwe_array_2,
|
||||
lut->lwe_indexes_in, big_lwe_dimension, shift,
|
||||
num_radix_blocks);
|
||||
pack_bivariate_blocks<Torus>(streams, gpu_indexes, gpu_count,
|
||||
lwe_array_pbs_in, lut->lwe_trivial_indexes,
|
||||
lwe_array_1, lwe_array_2, lut->lwe_indexes_in,
|
||||
big_lwe_dimension, shift, num_radix_blocks);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
|
||||
/// For multi GPU execution we create vectors of pointers for inputs and
|
||||
/// outputs
|
||||
std::vector<Torus *> lwe_array_in_vec = lut->lwe_array_in_vec;
|
||||
std::vector<Torus *> lwe_after_ks_vec = lut->lwe_after_ks_vec;
|
||||
std::vector<Torus *> lwe_after_pbs_vec = lut->lwe_after_pbs_vec;
|
||||
std::vector<Torus *> lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec;
|
||||
|
||||
/// Apply KS to go from a big LWE dimension to a small LWE dimension
|
||||
execute_keyswitch<Torus>(streams, gpu_indexes, gpu_count,
|
||||
lut->tmp_lwe_after_ks, lut->lwe_trivial_indexes,
|
||||
lwe_array_pbs_in, lut->lwe_indexes_in, ksks,
|
||||
big_lwe_dimension, small_lwe_dimension, ks_base_log,
|
||||
ks_level, num_radix_blocks, false);
|
||||
|
||||
/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
|
||||
/// dimension to a big LWE dimension
|
||||
execute_pbs<Torus>(
|
||||
streams, gpu_indexes, gpu_count, lwe_array_out, lut->lwe_indexes_out,
|
||||
lut->lut_vec, lut->lut_indexes_vec, lut->tmp_lwe_after_ks,
|
||||
lut->lwe_trivial_indexes, bsks, lut->buffer, glwe_dimension,
|
||||
small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
|
||||
grouping_factor, num_radix_blocks, 1, 0,
|
||||
cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type, false);
|
||||
|
||||
/// Synchronize all GPUs
|
||||
auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
|
||||
for (uint i = 0; i < active_gpu_count; i++) {
|
||||
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
|
||||
if (active_gpu_count == 1) {
|
||||
execute_keyswitch_async<Torus>(streams, gpu_indexes, 1, lwe_after_ks_vec[0],
|
||||
lwe_trivial_indexes_vec[0], lwe_array_pbs_in,
|
||||
lut->lwe_indexes_in, ksks, big_lwe_dimension,
|
||||
small_lwe_dimension, ks_base_log, ks_level,
|
||||
num_radix_blocks);
|
||||
|
||||
/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
|
||||
/// dimension to a big LWE dimension
|
||||
execute_pbs_async<Torus>(
|
||||
streams, gpu_indexes, 1, lwe_array_out, lut->lwe_indexes_out,
|
||||
lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec[0],
|
||||
lwe_trivial_indexes_vec[0], bsks, lut->buffer, glwe_dimension,
|
||||
small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
|
||||
grouping_factor, num_radix_blocks, pbs_type, lut_count, lut_stride,
|
||||
true);
|
||||
} else {
|
||||
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
|
||||
multi_gpu_scatter_lwe_async<Torus>(
|
||||
streams, gpu_indexes, active_gpu_count, lwe_array_in_vec,
|
||||
lwe_array_pbs_in, lut->h_lwe_indexes_in, lut->using_trivial_lwe_indexes,
|
||||
num_radix_blocks, big_lwe_dimension + 1);
|
||||
|
||||
/// Apply KS to go from a big LWE dimension to a small LWE dimension
|
||||
execute_keyswitch_async<Torus>(streams, gpu_indexes, active_gpu_count,
|
||||
lwe_after_ks_vec, lwe_trivial_indexes_vec,
|
||||
lwe_array_in_vec, lwe_trivial_indexes_vec,
|
||||
ksks, big_lwe_dimension, small_lwe_dimension,
|
||||
ks_base_log, ks_level, num_radix_blocks);
|
||||
|
||||
/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
|
||||
/// dimension to a big LWE dimension
|
||||
execute_pbs_async<Torus>(
|
||||
streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec,
|
||||
lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec,
|
||||
lwe_after_ks_vec, lwe_trivial_indexes_vec, bsks, lut->buffer,
|
||||
glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log,
|
||||
pbs_level, grouping_factor, num_radix_blocks, pbs_type, lut_count,
|
||||
lut_stride, true);
|
||||
|
||||
/// Copy data back to GPU 0 and release vecs
|
||||
multi_gpu_gather_lwe_async<Torus>(streams, gpu_indexes, active_gpu_count,
|
||||
lwe_array_out, lwe_after_pbs_vec,
|
||||
lut->h_lwe_indexes_out,
|
||||
lut->using_trivial_lwe_indexes,
|
||||
num_radix_blocks, big_lwe_dimension + 1);
|
||||
|
||||
/// Synchronize all GPUs
|
||||
for (uint i = 0; i < active_gpu_count; i++) {
|
||||
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -272,7 +477,7 @@ void generate_lookup_table(Torus *acc, uint32_t glwe_dimension,
|
||||
body[i] = -body[i];
|
||||
}
|
||||
|
||||
rotate_left(body, half_box_size, polynomial_size);
|
||||
rotate_left<Torus>(body, half_box_size, polynomial_size);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
@@ -325,7 +530,6 @@ void generate_device_accumulator_bivariate(
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, std::function<Torus(Torus, Torus)> f) {
|
||||
|
||||
cudaSetDevice(gpu_index);
|
||||
// host lut
|
||||
Torus *h_lut =
|
||||
(Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus));
|
||||
@@ -357,7 +561,6 @@ void generate_device_accumulator_bivariate_with_factor(
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, std::function<Torus(Torus, Torus)> f, int factor) {
|
||||
|
||||
cudaSetDevice(gpu_index);
|
||||
// host lut
|
||||
Torus *h_lut =
|
||||
(Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus));
|
||||
@@ -367,6 +570,7 @@ void generate_device_accumulator_bivariate_with_factor(
|
||||
h_lut, glwe_dimension, polynomial_size, message_modulus, carry_modulus, f,
|
||||
factor);
|
||||
|
||||
cuda_synchronize_stream(stream, gpu_index);
|
||||
// copy host lut and lut_indexes_vec to device
|
||||
cuda_memcpy_async_to_gpu(acc_bivariate, h_lut,
|
||||
(glwe_dimension + 1) * polynomial_size *
|
||||
@@ -392,7 +596,6 @@ void generate_device_accumulator(cudaStream_t stream, uint32_t gpu_index,
|
||||
uint32_t carry_modulus,
|
||||
std::function<Torus(Torus)> f) {
|
||||
|
||||
cudaSetDevice(gpu_index);
|
||||
// host lut
|
||||
Torus *h_lut =
|
||||
(Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus));
|
||||
@@ -421,6 +624,43 @@ void scratch_cuda_propagate_single_carry_kb_inplace(
|
||||
num_radix_blocks, allocate_gpu_memory);
|
||||
}
|
||||
|
||||
/// Runs a Hillis-Steele style scan over `num_blocks` big-LWE blocks.
///
/// `generates_or_propagates` is first copied into `step_output`. Then, for
/// each doubling offset `space` (1, 2, 4, ...) strictly below `num_blocks`:
///   * the bivariate LUT `luts` is applied in place to the blocks of
///     `step_output` starting at index `space`, paired with the first
///     `num_blocks - space` blocks of `generates_or_propagates`;
///   * the updated blocks are copied back into `generates_or_propagates` at
///     the same offset, so the next step consumes them.
///
/// Both buffers are read and written. The copies are enqueued on
/// `streams[0]` / `gpu_indexes[0]`; this function does not synchronize.
///
/// - `streams`, `gpu_indexes`, `gpu_count`: forwarded to the LUT application.
/// - `step_output`: scratch/output buffer, at least `num_blocks` blocks.
/// - `generates_or_propagates`: input buffer, updated in place.
/// - `params`: supplies `glwe_dimension` and `polynomial_size`, from which the
///   block size `glwe_dimension * polynomial_size + 1` is derived.
/// - `luts`: bivariate LUT; its `params.message_modulus` is forwarded as the
///   last argument of the LUT application.
/// - `bsks`, `ksks`: keys forwarded to the LUT application.
/// - `num_blocks`: number of blocks to scan; 0 is a no-op.
template <typename Torus>
void host_compute_prefix_sum_hillis_steele(
    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    Torus *step_output, Torus *generates_or_propagates, int_radix_params params,
    int_radix_lut<Torus> *luts, void **bsks, Torus **ksks,
    uint32_t num_blocks) {

  // An empty radix has nothing to scan. Returning here also avoids deriving a
  // step count from log2(0), which is not representable as an integer.
  if (num_blocks == 0)
    return;

  auto glwe_dimension = params.glwe_dimension;
  auto polynomial_size = params.polynomial_size;
  auto big_lwe_size = glwe_dimension * polynomial_size + 1;
  auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus);

  cuda_memcpy_async_gpu_to_gpu(step_output, generates_or_propagates,
                               big_lwe_size_bytes * num_blocks, streams[0],
                               gpu_indexes[0]);

  // Doubling `space` while it stays below `num_blocks` performs exactly
  // ceil(log2(num_blocks)) steps and keeps every offset in bounds by
  // construction, so no floating-point step count or bounds PANIC is needed.
  // The 64-bit counter prevents wrap-around of both the doubling and the
  // element offset `space * big_lwe_size`.
  for (uint64_t space = 1; space < num_blocks; space *= 2) {
    auto cur_blocks = &step_output[space * big_lwe_size];
    auto prev_blocks = generates_or_propagates;
    int cur_total_blocks = static_cast<int>(num_blocks - space);

    integer_radix_apply_bivariate_lookup_table_kb<Torus>(
        streams, gpu_indexes, gpu_count, cur_blocks, cur_blocks, prev_blocks,
        bsks, ksks, cur_total_blocks, luts, luts->params.message_modulus);

    // Publish this step's result so the next step reads the updated blocks.
    cuda_memcpy_async_gpu_to_gpu(
        &generates_or_propagates[space * big_lwe_size], cur_blocks,
        big_lwe_size_bytes * cur_total_blocks, streams[0], gpu_indexes[0]);
  }
}
|
||||
|
||||
template <typename Torus>
|
||||
void host_propagate_single_carry(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
uint32_t gpu_count, Torus *lwe_array,
|
||||
@@ -445,33 +685,13 @@ void host_propagate_single_carry(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
ksks, num_blocks, luts_array);
|
||||
|
||||
// compute prefix sum with hillis&steele
|
||||
host_compute_prefix_sum_hillis_steele<Torus>(
|
||||
streams, gpu_indexes, gpu_count, step_output, generates_or_propagates,
|
||||
params, luts_carry_propagation_sum, bsks, ksks, num_blocks);
|
||||
|
||||
int num_steps = ceil(log2((double)num_blocks));
|
||||
int space = 1;
|
||||
cuda_memcpy_async_gpu_to_gpu(step_output, generates_or_propagates,
|
||||
big_lwe_size_bytes * num_blocks, streams[0],
|
||||
gpu_indexes[0]);
|
||||
|
||||
for (int step = 0; step < num_steps; step++) {
|
||||
auto cur_blocks = &step_output[space * big_lwe_size];
|
||||
auto prev_blocks = generates_or_propagates;
|
||||
int cur_total_blocks = num_blocks - space;
|
||||
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, cur_blocks, cur_blocks, prev_blocks,
|
||||
bsks, ksks, cur_total_blocks, luts_carry_propagation_sum,
|
||||
luts_carry_propagation_sum->params.message_modulus);
|
||||
|
||||
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
|
||||
cuda_memcpy_async_gpu_to_gpu(
|
||||
&generates_or_propagates[space * big_lwe_size], cur_blocks,
|
||||
big_lwe_size_bytes * cur_total_blocks, streams[0], gpu_indexes[0]);
|
||||
space *= 2;
|
||||
}
|
||||
|
||||
host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count, step_output,
|
||||
generates_or_propagates, 1, num_blocks,
|
||||
big_lwe_size);
|
||||
host_radix_blocks_rotate_right<Torus>(streams, gpu_indexes, gpu_count,
|
||||
step_output, generates_or_propagates, 1,
|
||||
num_blocks, big_lwe_size);
|
||||
if (carry_out != nullptr) {
|
||||
cuda_memcpy_async_gpu_to_gpu(carry_out, step_output, big_lwe_size_bytes,
|
||||
streams[0], gpu_indexes[0]);
|
||||
@@ -485,8 +705,9 @@ void host_propagate_single_carry(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
gpu_indexes[0]);
|
||||
}
|
||||
|
||||
host_addition(streams[0], gpu_indexes[0], lwe_array, lwe_array, step_output,
|
||||
glwe_dimension * polynomial_size, num_blocks);
|
||||
host_addition<Torus>(streams[0], gpu_indexes[0], lwe_array, lwe_array,
|
||||
step_output, glwe_dimension * polynomial_size,
|
||||
num_blocks);
|
||||
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, lwe_array, lwe_array, bsks, ksks,
|
||||
@@ -531,40 +752,23 @@ void host_propagate_single_sub_borrow(cudaStream_t *streams,
|
||||
ksks, num_blocks, luts_array);
|
||||
|
||||
// compute prefix sum with hillis&steele
|
||||
int num_steps = ceil(log2((double)num_blocks));
|
||||
int space = 1;
|
||||
cuda_memcpy_async_gpu_to_gpu(step_output, generates_or_propagates,
|
||||
big_lwe_size_bytes * num_blocks, streams[0],
|
||||
gpu_indexes[0]);
|
||||
|
||||
for (int step = 0; step < num_steps; step++) {
|
||||
auto cur_blocks = &step_output[space * big_lwe_size];
|
||||
auto prev_blocks = generates_or_propagates;
|
||||
int cur_total_blocks = num_blocks - space;
|
||||
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, cur_blocks, cur_blocks, prev_blocks,
|
||||
bsks, ksks, cur_total_blocks, luts_carry_propagation_sum,
|
||||
luts_carry_propagation_sum->params.message_modulus);
|
||||
|
||||
cuda_memcpy_async_gpu_to_gpu(
|
||||
&generates_or_propagates[space * big_lwe_size], cur_blocks,
|
||||
big_lwe_size_bytes * cur_total_blocks, streams[0], gpu_indexes[0]);
|
||||
space *= 2;
|
||||
}
|
||||
host_compute_prefix_sum_hillis_steele<Torus>(
|
||||
streams, gpu_indexes, gpu_count, step_output, generates_or_propagates,
|
||||
params, luts_carry_propagation_sum, bsks, ksks, num_blocks);
|
||||
|
||||
cuda_memcpy_async_gpu_to_gpu(
|
||||
overflowed, &generates_or_propagates[big_lwe_size * (num_blocks - 1)],
|
||||
big_lwe_size_bytes, streams[0], gpu_indexes[0]);
|
||||
|
||||
host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count, step_output,
|
||||
generates_or_propagates, 1, num_blocks,
|
||||
big_lwe_size);
|
||||
host_radix_blocks_rotate_right<Torus>(streams, gpu_indexes, gpu_count,
|
||||
step_output, generates_or_propagates, 1,
|
||||
num_blocks, big_lwe_size);
|
||||
cuda_memset_async(step_output, 0, big_lwe_size_bytes, streams[0],
|
||||
gpu_indexes[0]);
|
||||
|
||||
host_subtraction(streams[0], gpu_indexes[0], lwe_array, lwe_array,
|
||||
step_output, glwe_dimension * polynomial_size, num_blocks);
|
||||
host_subtraction<Torus>(streams[0], gpu_indexes[0], lwe_array, lwe_array,
|
||||
step_output, glwe_dimension * polynomial_size,
|
||||
num_blocks);
|
||||
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, lwe_array, lwe_array, bsks, ksks,
|
||||
@@ -590,15 +794,17 @@ void host_full_propagate_inplace(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
int big_lwe_size = (params.glwe_dimension * params.polynomial_size + 1);
|
||||
int small_lwe_size = (params.small_lwe_dimension + 1);
|
||||
|
||||
// In the case of extracting a single LWE these parameters are dummies
|
||||
uint32_t lut_count = 1;
|
||||
uint32_t lut_stride = 0;
|
||||
for (int i = 0; i < num_blocks; i++) {
|
||||
auto cur_input_block = &input_blocks[i * big_lwe_size];
|
||||
|
||||
cudaSetDevice(gpu_indexes[0]);
|
||||
/// Since the keyswitch is done on one input only, use only 1 GPU
|
||||
cuda_keyswitch_lwe_ciphertext_vector<Torus>(
|
||||
streams[0], gpu_indexes[0], mem_ptr->tmp_small_lwe_vector,
|
||||
execute_keyswitch_async<Torus>(
|
||||
streams, gpu_indexes, 1, mem_ptr->tmp_small_lwe_vector,
|
||||
mem_ptr->lut->lwe_trivial_indexes, cur_input_block,
|
||||
mem_ptr->lut->lwe_trivial_indexes, ksks[0], params.big_lwe_dimension,
|
||||
mem_ptr->lut->lwe_trivial_indexes, ksks, params.big_lwe_dimension,
|
||||
params.small_lwe_dimension, params.ks_base_log, params.ks_level, 1);
|
||||
|
||||
cuda_memcpy_async_gpu_to_gpu(&mem_ptr->tmp_small_lwe_vector[small_lwe_size],
|
||||
@@ -606,15 +812,15 @@ void host_full_propagate_inplace(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
small_lwe_size * sizeof(Torus), streams[0],
|
||||
gpu_indexes[0]);
|
||||
|
||||
execute_pbs<Torus>(
|
||||
execute_pbs_async<Torus>(
|
||||
streams, gpu_indexes, 1, mem_ptr->tmp_big_lwe_vector,
|
||||
mem_ptr->lut->lwe_trivial_indexes, mem_ptr->lut->lut_vec,
|
||||
mem_ptr->lut->lut_indexes_vec, mem_ptr->tmp_small_lwe_vector,
|
||||
mem_ptr->lut->lwe_trivial_indexes, bsks, mem_ptr->lut->buffer,
|
||||
params.glwe_dimension, params.small_lwe_dimension,
|
||||
params.polynomial_size, params.pbs_base_log, params.pbs_level,
|
||||
params.grouping_factor, 2, 2, 0,
|
||||
cuda_get_max_shared_memory(gpu_indexes[0]), params.pbs_type);
|
||||
params.grouping_factor, 2, params.pbs_type, lut_count, lut_stride,
|
||||
true);
|
||||
|
||||
cuda_memcpy_async_gpu_to_gpu(cur_input_block, mem_ptr->tmp_big_lwe_vector,
|
||||
big_lwe_size * sizeof(Torus), streams[0],
|
||||
@@ -622,10 +828,10 @@ void host_full_propagate_inplace(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
|
||||
if (i < num_blocks - 1) {
|
||||
auto next_input_block = &input_blocks[(i + 1) * big_lwe_size];
|
||||
host_addition(streams[0], gpu_indexes[0], next_input_block,
|
||||
next_input_block,
|
||||
&mem_ptr->tmp_big_lwe_vector[big_lwe_size],
|
||||
params.big_lwe_dimension, 1);
|
||||
host_addition<Torus>(streams[0], gpu_indexes[0], next_input_block,
|
||||
next_input_block,
|
||||
&mem_ptr->tmp_big_lwe_vector[big_lwe_size],
|
||||
params.big_lwe_dimension, 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -635,12 +841,10 @@ void scratch_cuda_full_propagation(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
uint32_t gpu_count,
|
||||
int_fullprop_buffer<Torus> **mem_ptr,
|
||||
int_radix_params params,
|
||||
uint32_t num_radix_blocks,
|
||||
bool allocate_gpu_memory) {
|
||||
|
||||
*mem_ptr =
|
||||
new int_fullprop_buffer<Torus>(streams, gpu_indexes, gpu_count, params,
|
||||
num_radix_blocks, allocate_gpu_memory);
|
||||
*mem_ptr = new int_fullprop_buffer<Torus>(streams, gpu_indexes, gpu_count,
|
||||
params, allocate_gpu_memory);
|
||||
}
|
||||
|
||||
// (lwe_dimension+1) threads
|
||||
@@ -662,7 +866,7 @@ __global__ void device_pack_blocks(Torus *lwe_array_out, Torus *lwe_array_in,
|
||||
}
|
||||
|
||||
if (num_radix_blocks % 2 == 1) {
|
||||
// We couldn't pack the last block, so we just copy it
|
||||
// We couldn't host_pack the last block, so we just copy it
|
||||
Torus *lsb_block =
|
||||
lwe_array_in + (num_radix_blocks - 1) * (lwe_dimension + 1);
|
||||
Torus *last_block =
|
||||
@@ -685,12 +889,13 @@ __host__ void pack_blocks(cudaStream_t stream, uint32_t gpu_index,
|
||||
Torus *lwe_array_out, Torus *lwe_array_in,
|
||||
uint32_t lwe_dimension, uint32_t num_radix_blocks,
|
||||
uint32_t factor) {
|
||||
if (num_radix_blocks == 0)
|
||||
return;
|
||||
cudaSetDevice(gpu_index);
|
||||
|
||||
int num_blocks = 0, num_threads = 0;
|
||||
int num_entries = (lwe_dimension + 1);
|
||||
getNumBlocksAndThreads(num_entries, 1024, num_blocks, num_threads);
|
||||
device_pack_blocks<<<num_blocks, num_threads, 0, stream>>>(
|
||||
device_pack_blocks<Torus><<<num_blocks, num_threads, 0, stream>>>(
|
||||
lwe_array_out, lwe_array_in, lwe_dimension, num_radix_blocks, factor);
|
||||
}
|
||||
|
||||
@@ -736,7 +941,7 @@ create_trivial_radix(cudaStream_t stream, uint32_t gpu_index,
|
||||
// this
|
||||
uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);
|
||||
|
||||
device_create_trivial_radix<<<grid, thds, 0, stream>>>(
|
||||
device_create_trivial_radix<Torus><<<grid, thds, 0, stream>>>(
|
||||
lwe_array_out, scalar_array, num_scalar_blocks, lwe_dimension, delta);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
@@ -753,7 +958,7 @@ __host__ void extract_n_bits(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
uint32_t num_radix_blocks, uint32_t bits_per_block,
|
||||
int_bit_extract_luts_buffer<Torus> *bit_extract) {
|
||||
|
||||
integer_radix_apply_univariate_lookup_table_kb(
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in, bsks, ksks,
|
||||
num_radix_blocks * bits_per_block, bit_extract->lut);
|
||||
}
|
||||
@@ -766,7 +971,6 @@ reduce_signs(cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
|
||||
std::function<Torus(Torus)> sign_handler_f, void **bsks,
|
||||
Torus **ksks, uint32_t num_sign_blocks) {
|
||||
|
||||
cudaSetDevice(gpu_indexes[0]);
|
||||
auto diff_buffer = mem_ptr->diff_buffer;
|
||||
|
||||
auto params = mem_ptr->params;
|
||||
@@ -800,9 +1004,9 @@ reduce_signs(cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
|
||||
lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
|
||||
|
||||
while (num_sign_blocks > 2) {
|
||||
pack_blocks(streams[0], gpu_indexes[0], signs_b, signs_a,
|
||||
big_lwe_dimension, num_sign_blocks, 4);
|
||||
integer_radix_apply_univariate_lookup_table_kb(
|
||||
pack_blocks<Torus>(streams[0], gpu_indexes[0], signs_b, signs_a,
|
||||
big_lwe_dimension, num_sign_blocks, 4);
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, signs_a, signs_b, bsks, ksks,
|
||||
num_sign_blocks / 2, lut);
|
||||
|
||||
@@ -833,11 +1037,11 @@ reduce_signs(cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
|
||||
final_lut_f);
|
||||
lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
|
||||
|
||||
pack_blocks(streams[0], gpu_indexes[0], signs_b, signs_a, big_lwe_dimension,
|
||||
2, 4);
|
||||
integer_radix_apply_univariate_lookup_table_kb(streams, gpu_indexes,
|
||||
gpu_count, signs_array_out,
|
||||
signs_b, bsks, ksks, 1, lut);
|
||||
pack_blocks<Torus>(streams[0], gpu_indexes[0], signs_b, signs_a,
|
||||
big_lwe_dimension, 2, 4);
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, signs_array_out, signs_b, bsks, ksks,
|
||||
1, lut);
|
||||
|
||||
} else {
|
||||
|
||||
@@ -853,9 +1057,9 @@ reduce_signs(cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
|
||||
final_lut_f);
|
||||
lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
|
||||
|
||||
integer_radix_apply_univariate_lookup_table_kb(streams, gpu_indexes,
|
||||
gpu_count, signs_array_out,
|
||||
signs_a, bsks, ksks, 1, lut);
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, signs_array_out, signs_a, bsks, ksks,
|
||||
1, lut);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -888,6 +1092,18 @@ void host_apply_univariate_lut_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
num_blocks, mem);
|
||||
}
|
||||
|
||||
/// Host entry point for applying a "many-LUT" univariate lookup table to a
/// radix ciphertext.
///
/// This is a pure forwarding wrapper around
/// `integer_radix_apply_many_univariate_lookup_table_kb<Torus>`; it performs
/// no work of its own.
///
/// - `streams`, `gpu_indexes`, `gpu_count`: forwarded unchanged.
/// - `radix_lwe_out` / `radix_lwe_in`: output and input block arrays.
/// - `mem`: LUT object forwarded to the callee.
/// - `ksks`, `bsks`: keys; note that this wrapper takes them in the order
///   (ksks, bsks) while the callee receives them as (bsks, ksks).
/// - `num_blocks`: number of radix blocks to process.
/// - `lut_count`, `lut_stride`: forwarded unchanged to the callee.
template <typename Torus>
void host_apply_many_univariate_lut_kb(
    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    Torus *radix_lwe_out, Torus *radix_lwe_in, int_radix_lut<Torus> *mem,
    Torus **ksks, void **bsks, uint32_t num_blocks, uint32_t lut_count,
    uint32_t lut_stride) {
  integer_radix_apply_many_univariate_lookup_table_kb<Torus>(
      streams, gpu_indexes, gpu_count,
      radix_lwe_out, radix_lwe_in,
      bsks, ksks,
      num_blocks, mem,
      lut_count, lut_stride);
}
|
||||
|
||||
template <typename Torus>
|
||||
void scratch_cuda_apply_bivariate_lut_kb(
|
||||
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
|
||||
|
||||
@@ -71,7 +71,7 @@ void scratch_cuda_integer_mult_radix_ciphertext_kb_64(
|
||||
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t pbs_base_log,
|
||||
uint32_t pbs_level, uint32_t ks_base_log, uint32_t ks_level,
|
||||
uint32_t grouping_factor, uint32_t num_radix_blocks, PBS_TYPE pbs_type,
|
||||
uint32_t max_shared_memory, bool allocate_gpu_memory) {
|
||||
bool allocate_gpu_memory) {
|
||||
|
||||
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
|
||||
polynomial_size * glwe_dimension, lwe_dimension,
|
||||
@@ -123,7 +123,6 @@ void scratch_cuda_integer_mult_radix_ciphertext_kb_64(
|
||||
* - 'num_blocks' is the number of big lwe ciphertext blocks inside radix
|
||||
* ciphertext
|
||||
* - 'pbs_type' selects which PBS implementation should be used
|
||||
* - 'max_shared_memory' maximum shared memory per cuda block
|
||||
*/
|
||||
void cuda_integer_mult_radix_ciphertext_kb_64(
|
||||
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
|
||||
@@ -133,7 +132,7 @@ void cuda_integer_mult_radix_ciphertext_kb_64(
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
host_integer_mult_radix_kb<uint64_t, int64_t, AmortizedDegree<256>>(
|
||||
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<256>>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
static_cast<uint64_t *>(radix_lwe_out),
|
||||
static_cast<uint64_t *>(radix_lwe_left),
|
||||
@@ -141,7 +140,7 @@ void cuda_integer_mult_radix_ciphertext_kb_64(
|
||||
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
|
||||
break;
|
||||
case 512:
|
||||
host_integer_mult_radix_kb<uint64_t, int64_t, AmortizedDegree<512>>(
|
||||
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<512>>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
static_cast<uint64_t *>(radix_lwe_out),
|
||||
static_cast<uint64_t *>(radix_lwe_left),
|
||||
@@ -149,7 +148,7 @@ void cuda_integer_mult_radix_ciphertext_kb_64(
|
||||
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
|
||||
break;
|
||||
case 1024:
|
||||
host_integer_mult_radix_kb<uint64_t, int64_t, AmortizedDegree<1024>>(
|
||||
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<1024>>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
static_cast<uint64_t *>(radix_lwe_out),
|
||||
static_cast<uint64_t *>(radix_lwe_left),
|
||||
@@ -157,7 +156,7 @@ void cuda_integer_mult_radix_ciphertext_kb_64(
|
||||
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
|
||||
break;
|
||||
case 2048:
|
||||
host_integer_mult_radix_kb<uint64_t, int64_t, AmortizedDegree<2048>>(
|
||||
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<2048>>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
static_cast<uint64_t *>(radix_lwe_out),
|
||||
static_cast<uint64_t *>(radix_lwe_left),
|
||||
@@ -165,7 +164,7 @@ void cuda_integer_mult_radix_ciphertext_kb_64(
|
||||
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
|
||||
break;
|
||||
case 4096:
|
||||
host_integer_mult_radix_kb<uint64_t, int64_t, AmortizedDegree<4096>>(
|
||||
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<4096>>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
static_cast<uint64_t *>(radix_lwe_out),
|
||||
static_cast<uint64_t *>(radix_lwe_left),
|
||||
@@ -173,7 +172,7 @@ void cuda_integer_mult_radix_ciphertext_kb_64(
|
||||
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
|
||||
break;
|
||||
case 8192:
|
||||
host_integer_mult_radix_kb<uint64_t, int64_t, AmortizedDegree<8192>>(
|
||||
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<8192>>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
static_cast<uint64_t *>(radix_lwe_out),
|
||||
static_cast<uint64_t *>(radix_lwe_left),
|
||||
@@ -181,7 +180,7 @@ void cuda_integer_mult_radix_ciphertext_kb_64(
|
||||
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
|
||||
break;
|
||||
case 16384:
|
||||
host_integer_mult_radix_kb<uint64_t, int64_t, AmortizedDegree<16384>>(
|
||||
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<16384>>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
static_cast<uint64_t *>(radix_lwe_out),
|
||||
static_cast<uint64_t *>(radix_lwe_left),
|
||||
@@ -203,7 +202,7 @@ void cleanup_cuda_integer_mult(void **streams, uint32_t *gpu_indexes,
|
||||
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
|
||||
}
|
||||
|
||||
void scratch_cuda_integer_radix_sum_ciphertexts_vec_kb_64(
|
||||
void scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
|
||||
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t lwe_dimension,
|
||||
uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
|
||||
@@ -216,13 +215,13 @@ void scratch_cuda_integer_radix_sum_ciphertexts_vec_kb_64(
|
||||
glwe_dimension * polynomial_size, lwe_dimension,
|
||||
ks_level, ks_base_log, pbs_level, pbs_base_log,
|
||||
grouping_factor, message_modulus, carry_modulus);
|
||||
scratch_cuda_integer_sum_ciphertexts_vec_kb<uint64_t>(
|
||||
scratch_cuda_integer_partial_sum_ciphertexts_vec_kb<uint64_t>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
(int_sum_ciphertexts_vec_memory<uint64_t> **)mem_ptr, num_blocks_in_radix,
|
||||
max_num_radix_in_vec, params, allocate_gpu_memory);
|
||||
}
|
||||
|
||||
void cuda_integer_radix_sum_ciphertexts_vec_kb_64(
|
||||
void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
|
||||
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
|
||||
void *radix_lwe_out, void *radix_lwe_vec, uint32_t num_radix_in_vec,
|
||||
int8_t *mem_ptr, void **bsks, void **ksks, uint32_t num_blocks_in_radix) {
|
||||
@@ -238,46 +237,57 @@ void cuda_integer_radix_sum_ciphertexts_vec_kb_64(
|
||||
|
||||
switch (mem->params.polynomial_size) {
|
||||
case 512:
|
||||
host_integer_sum_ciphertexts_vec_kb<uint64_t, AmortizedDegree<512>>(
|
||||
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t, AmortizedDegree<512>>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
static_cast<uint64_t *>(radix_lwe_out),
|
||||
static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsks,
|
||||
(uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec);
|
||||
(uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec,
|
||||
nullptr);
|
||||
break;
|
||||
case 1024:
|
||||
host_integer_sum_ciphertexts_vec_kb<uint64_t, AmortizedDegree<1024>>(
|
||||
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
|
||||
AmortizedDegree<1024>>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
static_cast<uint64_t *>(radix_lwe_out),
|
||||
static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsks,
|
||||
(uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec);
|
||||
(uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec,
|
||||
nullptr);
|
||||
break;
|
||||
case 2048:
|
||||
host_integer_sum_ciphertexts_vec_kb<uint64_t, AmortizedDegree<2048>>(
|
||||
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
|
||||
AmortizedDegree<2048>>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
static_cast<uint64_t *>(radix_lwe_out),
|
||||
static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsks,
|
||||
(uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec);
|
||||
(uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec,
|
||||
nullptr);
|
||||
break;
|
||||
case 4096:
|
||||
host_integer_sum_ciphertexts_vec_kb<uint64_t, AmortizedDegree<4096>>(
|
||||
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
|
||||
AmortizedDegree<4096>>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
static_cast<uint64_t *>(radix_lwe_out),
|
||||
static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsks,
|
||||
(uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec);
|
||||
(uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec,
|
||||
nullptr);
|
||||
break;
|
||||
case 8192:
|
||||
host_integer_sum_ciphertexts_vec_kb<uint64_t, AmortizedDegree<8192>>(
|
||||
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
|
||||
AmortizedDegree<8192>>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
static_cast<uint64_t *>(radix_lwe_out),
|
||||
static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsks,
|
||||
(uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec);
|
||||
(uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec,
|
||||
nullptr);
|
||||
break;
|
||||
case 16384:
|
||||
host_integer_sum_ciphertexts_vec_kb<uint64_t, AmortizedDegree<16384>>(
|
||||
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
|
||||
AmortizedDegree<16384>>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
static_cast<uint64_t *>(radix_lwe_out),
|
||||
static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsks,
|
||||
(uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec);
|
||||
(uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec,
|
||||
nullptr);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (integer multiplication): unsupported polynomial size. "
|
||||
@@ -287,10 +297,9 @@ void cuda_integer_radix_sum_ciphertexts_vec_kb_64(
|
||||
free(terms_degree);
|
||||
}
|
||||
|
||||
void cleanup_cuda_integer_radix_sum_ciphertexts_vec(void **streams,
|
||||
uint32_t *gpu_indexes,
|
||||
uint32_t gpu_count,
|
||||
int8_t **mem_ptr_void) {
|
||||
void cleanup_cuda_integer_radix_partial_sum_ciphertexts_vec(
|
||||
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
|
||||
int8_t **mem_ptr_void) {
|
||||
int_sum_ciphertexts_vec_memory<uint64_t> *mem_ptr =
|
||||
(int_sum_ciphertexts_vec_memory<uint64_t> *)(*mem_ptr_void);
|
||||
|
||||
|
||||
@@ -8,11 +8,13 @@
|
||||
|
||||
#include "crypto/keyswitch.cuh"
|
||||
#include "device.h"
|
||||
#include "helper_multi_gpu.h"
|
||||
#include "integer.h"
|
||||
#include "integer/integer.cuh"
|
||||
#include "linear_algebra.h"
|
||||
#include "programmable_bootstrap.h"
|
||||
#include "utils/helper.cuh"
|
||||
#include "utils/helper_multi_gpu.cuh"
|
||||
#include "utils/kernel_dimensions.cuh"
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
@@ -91,15 +93,11 @@ all_shifted_lhs_rhs(Torus *radix_lwe_left, Torus *lsb_ciphertext,
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus, sharedMemDegree SMD>
|
||||
template <typename Torus>
|
||||
__global__ void tree_add_chunks(Torus *result_blocks, Torus *input_blocks,
|
||||
uint32_t chunk_size, uint32_t block_size,
|
||||
uint32_t num_blocks) {
|
||||
|
||||
extern __shared__ int8_t sharedmem[];
|
||||
|
||||
Torus *result = (Torus *)sharedmem;
|
||||
|
||||
size_t stride = blockDim.x;
|
||||
size_t chunk_id = blockIdx.x;
|
||||
size_t chunk_elem_size = chunk_size * num_blocks * block_size;
|
||||
@@ -107,10 +105,7 @@ __global__ void tree_add_chunks(Torus *result_blocks, Torus *input_blocks,
|
||||
auto src_chunk = &input_blocks[chunk_id * chunk_elem_size];
|
||||
auto dst_radix = &result_blocks[chunk_id * radix_elem_size];
|
||||
size_t block_stride = blockIdx.y * block_size;
|
||||
auto dst_block = &dst_radix[block_stride];
|
||||
|
||||
if constexpr (SMD == NOSM)
|
||||
result = dst_block;
|
||||
auto result = &dst_radix[block_stride];
|
||||
|
||||
// init shared mem with first radix of chunk
|
||||
size_t tid = threadIdx.x;
|
||||
@@ -125,18 +120,12 @@ __global__ void tree_add_chunks(Torus *result_blocks, Torus *input_blocks,
|
||||
result[i] += cur_src_radix[block_stride + i];
|
||||
}
|
||||
}
|
||||
|
||||
// put result from shared mem to global mem
|
||||
if constexpr (SMD == FULLSM)
|
||||
for (int i = tid; i < block_size; i += stride)
|
||||
dst_block[i] = result[i];
|
||||
}
|
||||
|
||||
template <typename Torus, class params>
|
||||
__global__ void fill_radix_from_lsb_msb(Torus *result_blocks, Torus *lsb_blocks,
|
||||
Torus *msb_blocks,
|
||||
uint32_t glwe_dimension,
|
||||
uint32_t lsb_count, uint32_t msb_count,
|
||||
uint32_t num_blocks) {
|
||||
size_t big_lwe_dimension = glwe_dimension * params::degree + 1;
|
||||
size_t big_lwe_id = blockIdx.x;
|
||||
@@ -180,41 +169,27 @@ __global__ void fill_radix_from_lsb_msb(Torus *result_blocks, Torus *lsb_blocks,
|
||||
}
|
||||
}
|
||||
template <typename Torus>
|
||||
__host__ void scratch_cuda_integer_sum_ciphertexts_vec_kb(
|
||||
__host__ void scratch_cuda_integer_partial_sum_ciphertexts_vec_kb(
|
||||
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
|
||||
int_sum_ciphertexts_vec_memory<Torus> **mem_ptr,
|
||||
uint32_t num_blocks_in_radix, uint32_t max_num_radix_in_vec,
|
||||
int_radix_params params, bool allocate_gpu_memory) {
|
||||
|
||||
size_t sm_size = (params.big_lwe_dimension + 1) * sizeof(Torus);
|
||||
if (sm_size < cuda_get_max_shared_memory(gpu_indexes[0])) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
tree_add_chunks<Torus, FULLSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, sm_size));
|
||||
cudaFuncSetCacheConfig(tree_add_chunks<Torus, FULLSM>,
|
||||
cudaFuncCachePreferShared);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
} else {
|
||||
check_cuda_error(
|
||||
cudaFuncSetAttribute(tree_add_chunks<Torus, NOSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
|
||||
cudaFuncSetCacheConfig(tree_add_chunks<Torus, NOSM>, cudaFuncCachePreferL1);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
*mem_ptr = new int_sum_ciphertexts_vec_memory<Torus>(
|
||||
streams, gpu_indexes, gpu_count, params, num_blocks_in_radix,
|
||||
max_num_radix_in_vec, allocate_gpu_memory);
|
||||
}
|
||||
|
||||
template <typename Torus, class params>
|
||||
__host__ void host_integer_sum_ciphertexts_vec_kb(
|
||||
__host__ void host_integer_partial_sum_ciphertexts_vec_kb(
|
||||
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
|
||||
Torus *radix_lwe_out, Torus *terms, int *terms_degree, void **bsks,
|
||||
uint64_t **ksks, int_sum_ciphertexts_vec_memory<uint64_t> *mem_ptr,
|
||||
uint32_t num_blocks_in_radix, uint32_t num_radix_in_vec,
|
||||
int_radix_lut<Torus> *reused_lut = nullptr) {
|
||||
int_radix_lut<Torus> *reused_lut) {
|
||||
|
||||
auto new_blocks = mem_ptr->new_blocks;
|
||||
auto new_blocks_copy = mem_ptr->new_blocks_copy;
|
||||
auto old_blocks = mem_ptr->old_blocks;
|
||||
auto small_lwe_vector = mem_ptr->small_lwe_vector;
|
||||
|
||||
@@ -224,18 +199,38 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
|
||||
auto message_modulus = mem_ptr->params.message_modulus;
|
||||
auto carry_modulus = mem_ptr->params.carry_modulus;
|
||||
auto num_blocks = num_blocks_in_radix;
|
||||
auto big_lwe_size = mem_ptr->params.big_lwe_dimension + 1;
|
||||
auto big_lwe_dimension = mem_ptr->params.big_lwe_dimension;
|
||||
auto big_lwe_size = big_lwe_dimension + 1;
|
||||
auto glwe_dimension = mem_ptr->params.glwe_dimension;
|
||||
auto polynomial_size = mem_ptr->params.polynomial_size;
|
||||
auto lwe_dimension = mem_ptr->params.small_lwe_dimension;
|
||||
auto big_lwe_dimension = mem_ptr->params.big_lwe_dimension;
|
||||
auto small_lwe_dimension = mem_ptr->params.small_lwe_dimension;
|
||||
auto small_lwe_size = small_lwe_dimension + 1;
|
||||
|
||||
// In the case of extracting a single LWE this parameters are dummy
|
||||
uint32_t lut_count = 1;
|
||||
uint32_t lut_stride = 0;
|
||||
|
||||
if (num_radix_in_vec == 0)
|
||||
return;
|
||||
if (num_radix_in_vec == 1) {
|
||||
cuda_memcpy_async_gpu_to_gpu(radix_lwe_out, terms,
|
||||
num_blocks_in_radix * big_lwe_size *
|
||||
sizeof(Torus),
|
||||
streams[0], gpu_indexes[0]);
|
||||
return;
|
||||
}
|
||||
if (old_blocks != terms) {
|
||||
cuda_memcpy_async_gpu_to_gpu(old_blocks, terms,
|
||||
num_blocks_in_radix * num_radix_in_vec *
|
||||
big_lwe_size * sizeof(Torus),
|
||||
streams[0], gpu_indexes[0]);
|
||||
}
|
||||
if (num_radix_in_vec == 2) {
|
||||
host_addition<Torus>(streams[0], gpu_indexes[0], radix_lwe_out, old_blocks,
|
||||
&old_blocks[num_blocks * big_lwe_size],
|
||||
big_lwe_dimension, num_blocks);
|
||||
return;
|
||||
}
|
||||
|
||||
size_t r = num_radix_in_vec;
|
||||
size_t total_modulus = message_modulus * carry_modulus;
|
||||
@@ -247,22 +242,29 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
|
||||
int32_t h_smart_copy_in[r * num_blocks];
|
||||
int32_t h_smart_copy_out[r * num_blocks];
|
||||
|
||||
auto max_shared_memory = cuda_get_max_shared_memory(gpu_indexes[0]);
|
||||
/// Here it is important to query the default max shared memory on device 0
|
||||
/// instead of cuda_get_max_shared_memory,
|
||||
/// to avoid bugs with tree_add_chunks trying to use too much shared memory
|
||||
int max_shared_memory = 0;
|
||||
check_cuda_error(cudaDeviceGetAttribute(
|
||||
&max_shared_memory, cudaDevAttrMaxSharedMemoryPerBlock, 0));
|
||||
|
||||
// create lut object for message and carry
|
||||
// we allocate luts_message_carry in the host function (instead of scratch)
|
||||
// to reduce average memory consumption
|
||||
bool release_reused_lut = false;
|
||||
int_radix_lut<Torus> *luts_message_carry;
|
||||
size_t ch_amount = r / chunk_size;
|
||||
if (!ch_amount)
|
||||
ch_amount++;
|
||||
if (reused_lut == nullptr) {
|
||||
release_reused_lut = true;
|
||||
size_t ch_amount = r / chunk_size;
|
||||
if (!ch_amount)
|
||||
ch_amount++;
|
||||
reused_lut = new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count,
|
||||
mem_ptr->params, 2,
|
||||
2 * ch_amount * num_blocks, true);
|
||||
luts_message_carry = new int_radix_lut<Torus>(
|
||||
streams, gpu_indexes, gpu_count, mem_ptr->params, 2,
|
||||
2 * ch_amount * num_blocks, true);
|
||||
} else {
|
||||
luts_message_carry = new int_radix_lut<Torus>(
|
||||
streams, gpu_indexes, gpu_count, mem_ptr->params, 2,
|
||||
2 * ch_amount * num_blocks, reused_lut);
|
||||
}
|
||||
int_radix_lut<Torus> *luts_message_carry = reused_lut;
|
||||
auto message_acc = luts_message_carry->get_lut(gpu_indexes[0], 0);
|
||||
auto carry_acc = luts_message_carry->get_lut(gpu_indexes[0], 1);
|
||||
|
||||
@@ -289,15 +291,10 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
|
||||
if (!ch_amount)
|
||||
ch_amount++;
|
||||
dim3 add_grid(ch_amount, num_blocks, 1);
|
||||
size_t sm_size = big_lwe_size * sizeof(Torus);
|
||||
|
||||
cudaSetDevice(gpu_indexes[0]);
|
||||
if (sm_size < max_shared_memory)
|
||||
tree_add_chunks<Torus, FULLSM><<<add_grid, 512, sm_size, streams[0]>>>(
|
||||
new_blocks, old_blocks, min(r, chunk_size), big_lwe_size, num_blocks);
|
||||
else
|
||||
tree_add_chunks<Torus, NOSM><<<add_grid, 512, 0, streams[0]>>>(
|
||||
new_blocks, old_blocks, min(r, chunk_size), big_lwe_size, num_blocks);
|
||||
tree_add_chunks<Torus><<<add_grid, 512, 0, streams[0]>>>(
|
||||
new_blocks, old_blocks, min(r, chunk_size), big_lwe_size, num_blocks);
|
||||
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
@@ -310,23 +307,25 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
|
||||
terms_degree, h_lwe_idx_in, h_lwe_idx_out, h_smart_copy_in,
|
||||
h_smart_copy_out, ch_amount, r, num_blocks, chunk_size, message_max,
|
||||
total_count, message_count, carry_count, sm_copy_count);
|
||||
|
||||
auto lwe_indexes_in = luts_message_carry->lwe_indexes_in;
|
||||
auto lwe_indexes_out = luts_message_carry->lwe_indexes_out;
|
||||
luts_message_carry->set_lwe_indexes(streams[0], gpu_indexes[0],
|
||||
h_lwe_idx_in, h_lwe_idx_out);
|
||||
|
||||
size_t copy_size = total_count * sizeof(Torus);
|
||||
cuda_memcpy_async_to_gpu(lwe_indexes_in, h_lwe_idx_in, copy_size,
|
||||
streams[0], gpu_indexes[0]);
|
||||
cuda_memcpy_async_to_gpu(lwe_indexes_out, h_lwe_idx_out, copy_size,
|
||||
streams[0], gpu_indexes[0]);
|
||||
copy_size = sm_copy_count * sizeof(int32_t);
|
||||
size_t copy_size = sm_copy_count * sizeof(int32_t);
|
||||
cuda_memcpy_async_to_gpu(d_smart_copy_in, h_smart_copy_in, copy_size,
|
||||
streams[0], gpu_indexes[0]);
|
||||
cuda_memcpy_async_to_gpu(d_smart_copy_out, h_smart_copy_out, copy_size,
|
||||
streams[0], gpu_indexes[0]);
|
||||
|
||||
smart_copy<<<sm_copy_count, 1024, 0, streams[0]>>>(
|
||||
new_blocks, new_blocks, d_smart_copy_out, d_smart_copy_in,
|
||||
// inside d_smart_copy_in there are only -1 values
|
||||
// it's fine to call smart_copy with same pointer
|
||||
// as source and destination
|
||||
cuda_memcpy_async_gpu_to_gpu(new_blocks_copy, new_blocks,
|
||||
r * num_blocks * big_lwe_size * sizeof(Torus),
|
||||
streams[0], gpu_indexes[0]);
|
||||
smart_copy<Torus><<<sm_copy_count, 1024, 0, streams[0]>>>(
|
||||
new_blocks, new_blocks_copy, d_smart_copy_out, d_smart_copy_in,
|
||||
big_lwe_size);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
@@ -338,27 +337,97 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
|
||||
|
||||
luts_message_carry->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
|
||||
|
||||
auto active_gpu_count = get_active_gpu_count(total_count, gpu_count);
|
||||
/// Apply KS to go from a big LWE dimension to a small LWE dimension
|
||||
/// After this keyswitch execution, we need to synchronize the streams
|
||||
/// because the keyswitch and PBS do not operate on the same number of
|
||||
/// inputs
|
||||
execute_keyswitch<Torus>(streams, gpu_indexes, gpu_count, small_lwe_vector,
|
||||
lwe_indexes_in, new_blocks, lwe_indexes_in, ksks,
|
||||
polynomial_size * glwe_dimension, lwe_dimension,
|
||||
mem_ptr->params.ks_base_log,
|
||||
mem_ptr->params.ks_level, message_count, true);
|
||||
/// For multi GPU execution we create vectors of pointers for inputs and
|
||||
/// outputs
|
||||
std::vector<Torus *> new_blocks_vec = luts_message_carry->lwe_array_in_vec;
|
||||
std::vector<Torus *> small_lwe_vector_vec =
|
||||
luts_message_carry->lwe_after_ks_vec;
|
||||
std::vector<Torus *> lwe_after_pbs_vec =
|
||||
luts_message_carry->lwe_after_pbs_vec;
|
||||
std::vector<Torus *> lwe_trivial_indexes_vec =
|
||||
luts_message_carry->lwe_trivial_indexes_vec;
|
||||
|
||||
/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
|
||||
/// dimension to a big LWE dimension
|
||||
execute_pbs<Torus>(streams, gpu_indexes, gpu_count, new_blocks,
|
||||
lwe_indexes_out, luts_message_carry->lut_vec,
|
||||
luts_message_carry->lut_indexes_vec, small_lwe_vector,
|
||||
lwe_indexes_in, bsks, luts_message_carry->buffer,
|
||||
glwe_dimension, lwe_dimension, polynomial_size,
|
||||
mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level,
|
||||
mem_ptr->params.grouping_factor, total_count, 2, 0,
|
||||
max_shared_memory, mem_ptr->params.pbs_type, true);
|
||||
auto active_gpu_count = get_active_gpu_count(total_count, gpu_count);
|
||||
if (active_gpu_count == 1) {
|
||||
/// Apply KS to go from a big LWE dimension to a small LWE dimension
|
||||
/// After this keyswitch execution, we need to synchronize the streams
|
||||
/// because the keyswitch and PBS do not operate on the same number of
|
||||
/// inputs
|
||||
execute_keyswitch_async<Torus>(
|
||||
streams, gpu_indexes, 1, small_lwe_vector, lwe_indexes_in, new_blocks,
|
||||
lwe_indexes_in, ksks, polynomial_size * glwe_dimension,
|
||||
small_lwe_dimension, mem_ptr->params.ks_base_log,
|
||||
mem_ptr->params.ks_level, message_count);
|
||||
|
||||
/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
|
||||
/// dimension to a big LWE dimension
|
||||
execute_pbs_async<Torus>(
|
||||
streams, gpu_indexes, 1, new_blocks, lwe_indexes_out,
|
||||
luts_message_carry->lut_vec, luts_message_carry->lut_indexes_vec,
|
||||
small_lwe_vector, lwe_indexes_in, bsks, luts_message_carry->buffer,
|
||||
glwe_dimension, small_lwe_dimension, polynomial_size,
|
||||
mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level,
|
||||
mem_ptr->params.grouping_factor, total_count,
|
||||
mem_ptr->params.pbs_type, lut_count, lut_stride, true);
|
||||
} else {
|
||||
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
|
||||
|
||||
multi_gpu_scatter_lwe_async<Torus>(
|
||||
streams, gpu_indexes, active_gpu_count, new_blocks_vec, new_blocks,
|
||||
luts_message_carry->h_lwe_indexes_in,
|
||||
luts_message_carry->using_trivial_lwe_indexes, message_count,
|
||||
big_lwe_size);
|
||||
|
||||
/// Apply KS to go from a big LWE dimension to a small LWE dimension
|
||||
/// After this keyswitch execution, we need to synchronize the streams
|
||||
/// because the keyswitch and PBS do not operate on the same number of
|
||||
/// inputs
|
||||
execute_keyswitch_async<Torus>(
|
||||
streams, gpu_indexes, active_gpu_count, small_lwe_vector_vec,
|
||||
lwe_trivial_indexes_vec, new_blocks_vec, lwe_trivial_indexes_vec,
|
||||
ksks, big_lwe_dimension, small_lwe_dimension,
|
||||
mem_ptr->params.ks_base_log, mem_ptr->params.ks_level, total_count);
|
||||
|
||||
/// Copy data back to GPU 0, rebuild the lwe array, and scatter again on a
|
||||
/// different configuration
|
||||
multi_gpu_gather_lwe_async<Torus>(
|
||||
streams, gpu_indexes, gpu_count, small_lwe_vector,
|
||||
small_lwe_vector_vec, luts_message_carry->h_lwe_indexes_in,
|
||||
luts_message_carry->using_trivial_lwe_indexes, message_count,
|
||||
small_lwe_size);
|
||||
/// Synchronize all GPUs
|
||||
for (uint i = 0; i < active_gpu_count; i++) {
|
||||
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
|
||||
}
|
||||
|
||||
multi_gpu_scatter_lwe_async<Torus>(
|
||||
streams, gpu_indexes, gpu_count, small_lwe_vector_vec,
|
||||
small_lwe_vector, luts_message_carry->h_lwe_indexes_in,
|
||||
luts_message_carry->using_trivial_lwe_indexes, total_count,
|
||||
small_lwe_size);
|
||||
|
||||
/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
|
||||
/// dimension to a big LWE dimension
|
||||
execute_pbs_async<Torus>(
|
||||
streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec,
|
||||
lwe_trivial_indexes_vec, luts_message_carry->lut_vec,
|
||||
luts_message_carry->lut_indexes_vec, small_lwe_vector_vec,
|
||||
lwe_trivial_indexes_vec, bsks, luts_message_carry->buffer,
|
||||
glwe_dimension, small_lwe_dimension, polynomial_size,
|
||||
mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level,
|
||||
mem_ptr->params.grouping_factor, total_count,
|
||||
mem_ptr->params.pbs_type, lut_count, lut_stride, true);
|
||||
|
||||
multi_gpu_gather_lwe_async<Torus>(
|
||||
streams, gpu_indexes, active_gpu_count, new_blocks, lwe_after_pbs_vec,
|
||||
luts_message_carry->h_lwe_indexes_out,
|
||||
luts_message_carry->using_trivial_lwe_indexes, total_count,
|
||||
big_lwe_size);
|
||||
/// Synchronize all GPUs
|
||||
for (uint i = 0; i < active_gpu_count; i++) {
|
||||
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
|
||||
}
|
||||
}
|
||||
|
||||
int rem_blocks = (r > chunk_size) ? r % chunk_size * num_blocks : 0;
|
||||
int new_blocks_created = 2 * ch_amount * num_blocks;
|
||||
@@ -371,21 +440,15 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
|
||||
std::swap(new_blocks, old_blocks);
|
||||
r = (new_blocks_created + rem_blocks) / num_blocks;
|
||||
}
|
||||
if (release_reused_lut) {
|
||||
reused_lut->release(streams, gpu_indexes, gpu_count);
|
||||
delete (reused_lut);
|
||||
}
|
||||
luts_message_carry->release(streams, gpu_indexes, gpu_count);
|
||||
delete (luts_message_carry);
|
||||
|
||||
host_addition(streams[0], gpu_indexes[0], radix_lwe_out, old_blocks,
|
||||
&old_blocks[num_blocks * big_lwe_size], big_lwe_dimension,
|
||||
num_blocks);
|
||||
|
||||
host_propagate_single_carry<Torus>(streams, gpu_indexes, gpu_count,
|
||||
radix_lwe_out, nullptr, nullptr,
|
||||
mem_ptr->scp_mem, bsks, ksks, num_blocks);
|
||||
host_addition<Torus>(streams[0], gpu_indexes[0], radix_lwe_out, old_blocks,
|
||||
&old_blocks[num_blocks * big_lwe_size],
|
||||
big_lwe_dimension, num_blocks);
|
||||
}
|
||||
|
||||
template <typename Torus, typename STorus, class params>
|
||||
template <typename Torus, class params>
|
||||
__host__ void host_integer_mult_radix_kb(
|
||||
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
|
||||
uint64_t *radix_lwe_out, uint64_t *radix_lwe_left,
|
||||
@@ -477,8 +540,7 @@ __host__ void host_integer_mult_radix_kb(
|
||||
fill_radix_from_lsb_msb<Torus, params>
|
||||
<<<num_blocks * num_blocks, params::degree / params::opt, 0,
|
||||
streams[0]>>>(vector_result_sb, vector_result_lsb, vector_result_msb,
|
||||
glwe_dimension, lsb_vector_block_count,
|
||||
msb_vector_block_count, num_blocks);
|
||||
glwe_dimension, num_blocks);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
int terms_degree[2 * num_blocks * num_blocks];
|
||||
@@ -494,10 +556,15 @@ __host__ void host_integer_mult_radix_kb(
|
||||
terms_degree_msb[i] = (b_id > r_id) ? message_modulus - 2 : 0;
|
||||
}
|
||||
|
||||
host_integer_sum_ciphertexts_vec_kb<Torus, params>(
|
||||
host_integer_partial_sum_ciphertexts_vec_kb<Torus, params>(
|
||||
streams, gpu_indexes, gpu_count, radix_lwe_out, vector_result_sb,
|
||||
terms_degree, bsks, ksks, mem_ptr->sum_ciphertexts_mem, num_blocks,
|
||||
2 * num_blocks, mem_ptr->luts_array);
|
||||
|
||||
auto scp_mem_ptr = mem_ptr->sum_ciphertexts_mem->scp_mem;
|
||||
host_propagate_single_carry<Torus>(streams, gpu_indexes, gpu_count,
|
||||
radix_lwe_out, nullptr, nullptr,
|
||||
scp_mem_ptr, bsks, ksks, num_blocks);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
@@ -505,22 +572,6 @@ __host__ void scratch_cuda_integer_mult_radix_ciphertext_kb(
|
||||
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
|
||||
int_mul_memory<Torus> **mem_ptr, uint32_t num_radix_blocks,
|
||||
int_radix_params params, bool allocate_gpu_memory) {
|
||||
size_t sm_size = (params.big_lwe_dimension + 1) * sizeof(Torus);
|
||||
if (sm_size < cuda_get_max_shared_memory(gpu_indexes[0])) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
tree_add_chunks<Torus, FULLSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, sm_size));
|
||||
cudaFuncSetCacheConfig(tree_add_chunks<Torus, FULLSM>,
|
||||
cudaFuncCachePreferShared);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
} else {
|
||||
check_cuda_error(
|
||||
cudaFuncSetAttribute(tree_add_chunks<Torus, NOSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
|
||||
cudaFuncSetCacheConfig(tree_add_chunks<Torus, NOSM>, cudaFuncCachePreferL1);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
*mem_ptr = new int_mul_memory<Torus>(streams, gpu_indexes, gpu_count, params,
|
||||
num_radix_blocks, allocate_gpu_memory);
|
||||
}
|
||||
|
||||
@@ -1,14 +1,16 @@
|
||||
#include "integer/negation.cuh"
|
||||
|
||||
void cuda_negate_integer_radix_ciphertext_64_inplace(
|
||||
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
|
||||
uint32_t lwe_dimension, uint32_t lwe_ciphertext_count,
|
||||
uint32_t message_modulus, uint32_t carry_modulus) {
|
||||
void cuda_negate_integer_radix_ciphertext_64(
|
||||
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
|
||||
void *lwe_array_out, void *lwe_array_in, uint32_t lwe_dimension,
|
||||
uint32_t lwe_ciphertext_count, uint32_t message_modulus,
|
||||
uint32_t carry_modulus) {
|
||||
|
||||
host_integer_radix_negation(
|
||||
host_integer_radix_negation<uint64_t>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
static_cast<uint64_t *>(lwe_array), static_cast<uint64_t *>(lwe_array),
|
||||
lwe_dimension, lwe_ciphertext_count, message_modulus, carry_modulus);
|
||||
static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<uint64_t *>(lwe_array_in), lwe_dimension,
|
||||
lwe_ciphertext_count, message_modulus, carry_modulus);
|
||||
}
|
||||
|
||||
void scratch_cuda_integer_radix_overflowing_sub_kb_64(
|
||||
|
||||
@@ -25,14 +25,13 @@ template <typename Torus>
|
||||
__global__ void
|
||||
device_integer_radix_negation(Torus *output, Torus *input, int32_t num_blocks,
|
||||
uint64_t lwe_dimension, uint64_t message_modulus,
|
||||
uint64_t carry_modulus, uint64_t delta) {
|
||||
uint64_t delta) {
|
||||
int tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if (tid < lwe_dimension + 1) {
|
||||
bool is_body = (tid == lwe_dimension);
|
||||
|
||||
// z = ceil( degree / 2^p ) * 2^p
|
||||
uint64_t z = (2 * message_modulus - 1) / message_modulus;
|
||||
__syncthreads();
|
||||
z *= message_modulus;
|
||||
|
||||
// (0,Delta*z) - ct
|
||||
@@ -47,12 +46,9 @@ device_integer_radix_negation(Torus *output, Torus *input, int32_t num_blocks,
|
||||
|
||||
uint64_t encoded_zb = zb * delta;
|
||||
|
||||
__syncthreads();
|
||||
|
||||
// (0,Delta*z) - ct
|
||||
output[tid] =
|
||||
(is_body ? z * delta - (input[tid] + encoded_zb) : -input[tid]);
|
||||
__syncthreads();
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -75,16 +71,15 @@ host_integer_radix_negation(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
|
||||
dim3 grid(num_blocks, 1, 1);
|
||||
dim3 thds(num_threads, 1, 1);
|
||||
uint64_t shared_mem = input_lwe_ciphertext_count * sizeof(uint32_t);
|
||||
|
||||
// Value of the shift we multiply our messages by
|
||||
// If message_modulus and carry_modulus are always powers of 2 we can simplify
|
||||
// this
|
||||
uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);
|
||||
|
||||
device_integer_radix_negation<<<grid, thds, shared_mem, streams[0]>>>(
|
||||
device_integer_radix_negation<<<grid, thds, 0, streams[0]>>>(
|
||||
output, input, input_lwe_ciphertext_count, lwe_dimension, message_modulus,
|
||||
carry_modulus, delta);
|
||||
delta);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
@@ -107,7 +102,7 @@ __host__ void host_integer_overflowing_sub_kb(
|
||||
|
||||
auto radix_params = mem_ptr->params;
|
||||
|
||||
host_unchecked_sub_with_correcting_term(
|
||||
host_unchecked_sub_with_correcting_term<Torus>(
|
||||
streams[0], gpu_indexes[0], radix_lwe_out, radix_lwe_left,
|
||||
radix_lwe_right, radix_params.big_lwe_dimension, num_blocks,
|
||||
radix_params.message_modulus, radix_params.carry_modulus,
|
||||
|
||||
@@ -5,7 +5,7 @@ void cuda_scalar_addition_integer_radix_ciphertext_64_inplace(
|
||||
void *scalar_input, uint32_t lwe_dimension, uint32_t lwe_ciphertext_count,
|
||||
uint32_t message_modulus, uint32_t carry_modulus) {
|
||||
|
||||
host_integer_radix_scalar_addition_inplace(
|
||||
host_integer_radix_scalar_addition_inplace<uint64_t>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
static_cast<uint64_t *>(lwe_array), static_cast<uint64_t *>(scalar_input),
|
||||
lwe_dimension, lwe_ciphertext_count, message_modulus, carry_modulus);
|
||||
|
||||
@@ -18,10 +18,8 @@ __global__ void device_integer_radix_scalar_addition_inplace(
|
||||
|
||||
int tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if (tid < num_blocks) {
|
||||
Torus scalar = scalar_input[tid];
|
||||
Torus *body = lwe_array + tid * (lwe_dimension + 1) + lwe_dimension;
|
||||
|
||||
*body += scalar * delta;
|
||||
lwe_array[tid * (lwe_dimension + 1) + lwe_dimension] +=
|
||||
scalar_input[tid] * delta;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -45,9 +43,10 @@ __host__ void host_integer_radix_scalar_addition_inplace(
|
||||
// this
|
||||
uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);
|
||||
|
||||
device_integer_radix_scalar_addition_inplace<<<grid, thds, 0, streams[0]>>>(
|
||||
lwe_array, scalar_input, input_lwe_ciphertext_count, lwe_dimension,
|
||||
delta);
|
||||
device_integer_radix_scalar_addition_inplace<Torus>
|
||||
<<<grid, thds, 0, streams[0]>>>(lwe_array, scalar_input,
|
||||
input_lwe_ciphertext_count, lwe_dimension,
|
||||
delta);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
@@ -83,8 +82,9 @@ __host__ void host_integer_radix_add_scalar_one_inplace(
|
||||
// this
|
||||
uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);
|
||||
|
||||
device_integer_radix_add_scalar_one_inplace<<<grid, thds, 0, streams[0]>>>(
|
||||
lwe_array, input_lwe_ciphertext_count, lwe_dimension, delta);
|
||||
device_integer_radix_add_scalar_one_inplace<Torus>
|
||||
<<<grid, thds, 0, streams[0]>>>(lwe_array, input_lwe_ciphertext_count,
|
||||
lwe_dimension, delta);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
@@ -122,10 +122,10 @@ __host__ void host_integer_radix_scalar_subtraction_inplace(
|
||||
// this
|
||||
uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);
|
||||
|
||||
device_integer_radix_scalar_subtraction_inplace<<<grid, thds, 0,
|
||||
streams[0]>>>(
|
||||
lwe_array, scalar_input, input_lwe_ciphertext_count, lwe_dimension,
|
||||
delta);
|
||||
device_integer_radix_scalar_subtraction_inplace<Torus>
|
||||
<<<grid, thds, 0, streams[0]>>>(lwe_array, scalar_input,
|
||||
input_lwe_ciphertext_count, lwe_dimension,
|
||||
delta);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -2,7 +2,58 @@
|
||||
#define CUDA_INTEGER_SCALAR_COMPARISON_OPS_CUH
|
||||
|
||||
#include "integer/comparison.cuh"
|
||||
#include <omp.h>
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void scalar_compare_radix_blocks_kb(
|
||||
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
|
||||
Torus *lwe_array_out, Torus *lwe_array_in, Torus *scalar_blocks,
|
||||
int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
|
||||
uint32_t num_radix_blocks) {
|
||||
|
||||
if (num_radix_blocks == 0)
|
||||
return;
|
||||
auto params = mem_ptr->params;
|
||||
auto big_lwe_dimension = params.big_lwe_dimension;
|
||||
auto message_modulus = params.message_modulus;
|
||||
auto carry_modulus = params.carry_modulus;
|
||||
|
||||
// When rhs > lhs, the subtraction will overflow, and the bit of padding will
|
||||
// be set to 1
|
||||
// meaning that the output of the pbs will be the negative (modulo message
|
||||
// space)
|
||||
//
|
||||
// Example:
|
||||
// lhs: 1, rhs: 3, message modulus: 4, carry modulus 4
|
||||
// lhs - rhs = -2 % (4 * 4) = 14 = 1|1110 (padding_bit|b4b3b2b1)
|
||||
// Since there was an overflow the bit of padding is 1 and not 0.
|
||||
// When applying the LUT for an input value of 14 we would expect 1,
|
||||
// but since the bit of padding is 1, we will get -1 modulus our message
|
||||
// space, so (-1) % (4 * 4) = 15 = 1|1111 We then add one and get 0 = 0|0000
|
||||
|
||||
auto subtracted_blocks = mem_ptr->tmp_block_comparisons;
|
||||
cuda_memcpy_async_gpu_to_gpu(subtracted_blocks, lwe_array_in,
|
||||
num_radix_blocks * (big_lwe_dimension + 1) *
|
||||
sizeof(Torus),
|
||||
streams[0], gpu_indexes[0]);
|
||||
// Subtract
|
||||
// Here we need the true lwe sub, not the one that comes from shortint.
|
||||
host_integer_radix_scalar_subtraction_inplace<Torus>(
|
||||
streams, gpu_indexes, gpu_count, subtracted_blocks, scalar_blocks,
|
||||
big_lwe_dimension, num_radix_blocks, message_modulus, carry_modulus);
|
||||
|
||||
// Apply LUT to compare to 0
|
||||
auto sign_lut = mem_ptr->eq_buffer->is_non_zero_lut;
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, lwe_array_out, subtracted_blocks, bsks,
|
||||
ksks, num_radix_blocks, sign_lut);
|
||||
|
||||
// Add one
|
||||
// Here Lhs can have the following values: (-1) % (message modulus * carry
|
||||
// modulus), 0, 1 So the output values after the addition will be: 0, 1, 2
|
||||
host_integer_radix_add_scalar_one_inplace<Torus>(
|
||||
streams, gpu_indexes, gpu_count, lwe_array_out, big_lwe_dimension,
|
||||
num_radix_blocks, message_modulus, carry_modulus);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void integer_radix_unsigned_scalar_difference_check_kb(
|
||||
@@ -46,10 +97,10 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
|
||||
if (total_num_scalar_blocks == 0) {
|
||||
// We only have to compare blocks with zero
|
||||
// means scalar is zero
|
||||
host_compare_with_zero_equality(streams, gpu_indexes, gpu_count,
|
||||
mem_ptr->tmp_lwe_array_out, lwe_array_in,
|
||||
mem_ptr, bsks, ksks, total_num_radix_blocks,
|
||||
mem_ptr->is_zero_lut);
|
||||
host_compare_with_zero_equality<Torus>(
|
||||
streams, gpu_indexes, gpu_count, mem_ptr->tmp_lwe_array_out,
|
||||
lwe_array_in, mem_ptr, bsks, ksks, total_num_radix_blocks,
|
||||
mem_ptr->is_zero_lut);
|
||||
|
||||
auto scalar_last_leaf_lut_f = [sign_handler_f](Torus x) -> Torus {
|
||||
x = (x == 1 ? IS_EQUAL : IS_SUPERIOR);
|
||||
@@ -87,53 +138,44 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
|
||||
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
|
||||
}
|
||||
|
||||
#pragma omp parallel sections
|
||||
{
|
||||
// Both sections may be executed in parallel
|
||||
#pragma omp section
|
||||
{
|
||||
//////////////
|
||||
// lsb
|
||||
Torus *lhs = diff_buffer->tmp_packed_left;
|
||||
Torus *rhs = diff_buffer->tmp_packed_right;
|
||||
//////////////
|
||||
// lsb
|
||||
Torus *lhs = diff_buffer->tmp_packed_left;
|
||||
Torus *rhs = diff_buffer->tmp_packed_right;
|
||||
|
||||
pack_blocks(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in,
|
||||
big_lwe_dimension, num_lsb_radix_blocks, message_modulus);
|
||||
pack_blocks(lsb_streams[0], gpu_indexes[0], rhs, scalar_blocks, 0,
|
||||
total_num_scalar_blocks, message_modulus);
|
||||
pack_blocks<Torus>(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in,
|
||||
big_lwe_dimension, num_lsb_radix_blocks,
|
||||
message_modulus);
|
||||
pack_blocks<Torus>(lsb_streams[0], gpu_indexes[0], rhs, scalar_blocks, 0,
|
||||
total_num_scalar_blocks, message_modulus);
|
||||
|
||||
// From this point we have half number of blocks
|
||||
num_lsb_radix_blocks /= 2;
|
||||
num_lsb_radix_blocks += (total_num_scalar_blocks % 2);
|
||||
// From this point we have half number of blocks
|
||||
num_lsb_radix_blocks /= 2;
|
||||
num_lsb_radix_blocks += (total_num_scalar_blocks % 2);
|
||||
|
||||
// comparisons will be assigned
|
||||
// - 0 if lhs < rhs
|
||||
// - 1 if lhs == rhs
|
||||
// - 2 if lhs > rhs
|
||||
// comparisons will be assigned
|
||||
// - 0 if lhs < rhs
|
||||
// - 1 if lhs == rhs
|
||||
// - 2 if lhs > rhs
|
||||
|
||||
auto comparisons = mem_ptr->tmp_block_comparisons;
|
||||
scalar_compare_radix_blocks_kb(lsb_streams, gpu_indexes, gpu_count,
|
||||
comparisons, lhs, rhs, mem_ptr, bsks,
|
||||
ksks, num_lsb_radix_blocks);
|
||||
auto comparisons = mem_ptr->tmp_block_comparisons;
|
||||
scalar_compare_radix_blocks_kb<Torus>(lsb_streams, gpu_indexes, gpu_count,
|
||||
comparisons, lhs, rhs, mem_ptr, bsks,
|
||||
ksks, num_lsb_radix_blocks);
|
||||
|
||||
// Reduces a vec containing radix blocks that encrypts a sign
|
||||
// (inferior, equal, superior) to one single radix block containing the
|
||||
// final sign
|
||||
tree_sign_reduction(
|
||||
lsb_streams, gpu_indexes, gpu_count, lwe_array_lsb_out, comparisons,
|
||||
mem_ptr->diff_buffer->tree_buffer, mem_ptr->identity_lut_f, bsks,
|
||||
ksks, num_lsb_radix_blocks);
|
||||
}
|
||||
#pragma omp section
|
||||
{
|
||||
//////////////
|
||||
// msb
|
||||
host_compare_with_zero_equality(
|
||||
msb_streams, gpu_indexes, gpu_count, lwe_array_msb_out, msb,
|
||||
mem_ptr, bsks, ksks, num_msb_radix_blocks, mem_ptr->is_zero_lut);
|
||||
}
|
||||
}
|
||||
for (uint j = 0; j < gpu_count; j++) {
|
||||
// Reduces a vec containing radix blocks that encrypts a sign
|
||||
// (inferior, equal, superior) to one single radix block containing the
|
||||
// final sign
|
||||
tree_sign_reduction<Torus>(
|
||||
lsb_streams, gpu_indexes, gpu_count, lwe_array_lsb_out, comparisons,
|
||||
mem_ptr->diff_buffer->tree_buffer, mem_ptr->identity_lut_f, bsks, ksks,
|
||||
num_lsb_radix_blocks);
|
||||
//////////////
|
||||
// msb
|
||||
host_compare_with_zero_equality<Torus>(
|
||||
msb_streams, gpu_indexes, gpu_count, lwe_array_msb_out, msb, mem_ptr,
|
||||
bsks, ksks, num_msb_radix_blocks, mem_ptr->is_zero_lut);
|
||||
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
|
||||
cuda_synchronize_stream(lsb_streams[j], gpu_indexes[j]);
|
||||
cuda_synchronize_stream(msb_streams[j], gpu_indexes[j]);
|
||||
}
|
||||
@@ -156,7 +198,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
|
||||
scalar_bivariate_last_leaf_lut_f);
|
||||
lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
|
||||
|
||||
integer_radix_apply_bivariate_lookup_table_kb(
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_lsb_out,
|
||||
lwe_array_msb_out, bsks, ksks, 1, lut, lut->params.message_modulus);
|
||||
|
||||
@@ -170,10 +212,11 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
|
||||
Torus *lhs = diff_buffer->tmp_packed_left;
|
||||
Torus *rhs = diff_buffer->tmp_packed_right;
|
||||
|
||||
pack_blocks(streams[0], gpu_indexes[0], lhs, lwe_array_in,
|
||||
big_lwe_dimension, num_lsb_radix_blocks, message_modulus);
|
||||
pack_blocks(streams[0], gpu_indexes[0], rhs, scalar_blocks, 0,
|
||||
num_scalar_blocks, message_modulus);
|
||||
pack_blocks<Torus>(streams[0], gpu_indexes[0], lhs, lwe_array_in,
|
||||
big_lwe_dimension, num_lsb_radix_blocks,
|
||||
message_modulus);
|
||||
pack_blocks<Torus>(streams[0], gpu_indexes[0], rhs, scalar_blocks, 0,
|
||||
num_scalar_blocks, message_modulus);
|
||||
|
||||
// From this point we have half number of blocks
|
||||
num_lsb_radix_blocks /= 2;
|
||||
@@ -184,16 +227,17 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
|
||||
// - 1 if lhs == rhs
|
||||
// - 2 if lhs > rhs
|
||||
auto comparisons = mem_ptr->tmp_lwe_array_out;
|
||||
scalar_compare_radix_blocks_kb(streams, gpu_indexes, gpu_count, comparisons,
|
||||
lhs, rhs, mem_ptr, bsks, ksks,
|
||||
num_lsb_radix_blocks);
|
||||
scalar_compare_radix_blocks_kb<Torus>(streams, gpu_indexes, gpu_count,
|
||||
comparisons, lhs, rhs, mem_ptr, bsks,
|
||||
ksks, num_lsb_radix_blocks);
|
||||
|
||||
// Reduces a vec containing radix blocks that encrypts a sign
|
||||
// (inferior, equal, superior) to one single radix block containing the
|
||||
// final sign
|
||||
tree_sign_reduction(streams, gpu_indexes, gpu_count, lwe_array_out,
|
||||
comparisons, mem_ptr->diff_buffer->tree_buffer,
|
||||
sign_handler_f, bsks, ksks, num_lsb_radix_blocks);
|
||||
tree_sign_reduction<Torus>(streams, gpu_indexes, gpu_count, lwe_array_out,
|
||||
comparisons, mem_ptr->diff_buffer->tree_buffer,
|
||||
sign_handler_f, bsks, ksks,
|
||||
num_lsb_radix_blocks);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -205,7 +249,6 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
|
||||
std::function<Torus(Torus)> sign_handler_f, void **bsks, Torus **ksks,
|
||||
uint32_t total_num_radix_blocks, uint32_t total_num_scalar_blocks) {
|
||||
|
||||
cudaSetDevice(gpu_indexes[0]);
|
||||
auto params = mem_ptr->params;
|
||||
auto big_lwe_dimension = params.big_lwe_dimension;
|
||||
auto glwe_dimension = params.glwe_dimension;
|
||||
@@ -241,7 +284,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
|
||||
// We only have to compare blocks with zero
|
||||
// means scalar is zero
|
||||
Torus *are_all_msb_zeros = mem_ptr->tmp_lwe_array_out;
|
||||
host_compare_with_zero_equality(
|
||||
host_compare_with_zero_equality<Torus>(
|
||||
streams, gpu_indexes, gpu_count, are_all_msb_zeros, lwe_array_in,
|
||||
mem_ptr, bsks, ksks, total_num_radix_blocks, mem_ptr->is_zero_lut);
|
||||
Torus *sign_block =
|
||||
@@ -289,7 +332,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
|
||||
scalar_bivariate_last_leaf_lut_f);
|
||||
lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
|
||||
|
||||
integer_radix_apply_bivariate_lookup_table_kb(
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, lwe_array_out, are_all_msb_zeros,
|
||||
sign_block, bsks, ksks, 1, lut, lut->params.message_modulus);
|
||||
|
||||
@@ -311,101 +354,93 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
|
||||
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
|
||||
}
|
||||
|
||||
#pragma omp parallel sections
|
||||
{
|
||||
// Both sections may be executed in parallel
|
||||
#pragma omp section
|
||||
{
|
||||
//////////////
|
||||
// lsb
|
||||
Torus *lhs = diff_buffer->tmp_packed_left;
|
||||
Torus *rhs = diff_buffer->tmp_packed_right;
|
||||
//////////////
|
||||
// lsb
|
||||
Torus *lhs = diff_buffer->tmp_packed_left;
|
||||
Torus *rhs = diff_buffer->tmp_packed_right;
|
||||
|
||||
pack_blocks(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in,
|
||||
big_lwe_dimension, num_lsb_radix_blocks, message_modulus);
|
||||
pack_blocks(lsb_streams[0], gpu_indexes[0], rhs, scalar_blocks, 0,
|
||||
total_num_scalar_blocks, message_modulus);
|
||||
pack_blocks<Torus>(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in,
|
||||
big_lwe_dimension, num_lsb_radix_blocks,
|
||||
message_modulus);
|
||||
pack_blocks<Torus>(lsb_streams[0], gpu_indexes[0], rhs, scalar_blocks, 0,
|
||||
total_num_scalar_blocks, message_modulus);
|
||||
|
||||
// From this point we have half number of blocks
|
||||
num_lsb_radix_blocks /= 2;
|
||||
num_lsb_radix_blocks += (total_num_scalar_blocks % 2);
|
||||
// From this point we have half number of blocks
|
||||
num_lsb_radix_blocks /= 2;
|
||||
num_lsb_radix_blocks += (total_num_scalar_blocks % 2);
|
||||
|
||||
// comparisons will be assigned
|
||||
// - 0 if lhs < rhs
|
||||
// - 1 if lhs == rhs
|
||||
// - 2 if lhs > rhs
|
||||
// comparisons will be assigned
|
||||
// - 0 if lhs < rhs
|
||||
// - 1 if lhs == rhs
|
||||
// - 2 if lhs > rhs
|
||||
|
||||
auto comparisons = mem_ptr->tmp_block_comparisons;
|
||||
scalar_compare_radix_blocks_kb(lsb_streams, gpu_indexes, gpu_count,
|
||||
comparisons, lhs, rhs, mem_ptr, bsks,
|
||||
ksks, num_lsb_radix_blocks);
|
||||
auto comparisons = mem_ptr->tmp_block_comparisons;
|
||||
scalar_compare_radix_blocks_kb<Torus>(lsb_streams, gpu_indexes, gpu_count,
|
||||
comparisons, lhs, rhs, mem_ptr, bsks,
|
||||
ksks, num_lsb_radix_blocks);
|
||||
|
||||
// Reduces a vec containing radix blocks that encrypts a sign
|
||||
// (inferior, equal, superior) to one single radix block containing the
|
||||
// final sign
|
||||
tree_sign_reduction(
|
||||
lsb_streams, gpu_indexes, gpu_count, lwe_array_lsb_out, comparisons,
|
||||
mem_ptr->diff_buffer->tree_buffer, mem_ptr->identity_lut_f, bsks,
|
||||
ksks, num_lsb_radix_blocks);
|
||||
// Reduces a vec containing radix blocks that encrypts a sign
|
||||
// (inferior, equal, superior) to one single radix block containing the
|
||||
// final sign
|
||||
tree_sign_reduction<Torus>(
|
||||
lsb_streams, gpu_indexes, gpu_count, lwe_array_lsb_out, comparisons,
|
||||
mem_ptr->diff_buffer->tree_buffer, mem_ptr->identity_lut_f, bsks, ksks,
|
||||
num_lsb_radix_blocks);
|
||||
//////////////
|
||||
// msb
|
||||
// We remove the last block (which is the sign)
|
||||
Torus *are_all_msb_zeros = lwe_array_msb_out;
|
||||
host_compare_with_zero_equality<Torus>(
|
||||
msb_streams, gpu_indexes, gpu_count, are_all_msb_zeros, msb, mem_ptr,
|
||||
bsks, ksks, num_msb_radix_blocks, mem_ptr->is_zero_lut);
|
||||
|
||||
auto sign_bit_pos = (int)log2(message_modulus) - 1;
|
||||
|
||||
auto lut_f = [mem_ptr, sign_bit_pos](Torus sign_block,
|
||||
Torus msb_are_zeros) {
|
||||
bool sign_bit_is_set = (sign_block >> sign_bit_pos) == 1;
|
||||
CMP_ORDERING sign_block_ordering;
|
||||
if (sign_bit_is_set) {
|
||||
sign_block_ordering = CMP_ORDERING::IS_INFERIOR;
|
||||
} else if (sign_block != 0) {
|
||||
sign_block_ordering = CMP_ORDERING::IS_SUPERIOR;
|
||||
} else {
|
||||
sign_block_ordering = CMP_ORDERING::IS_EQUAL;
|
||||
}
|
||||
#pragma omp section
|
||||
{
|
||||
//////////////
|
||||
// msb
|
||||
// We remove the last block (which is the sign)
|
||||
Torus *are_all_msb_zeros = lwe_array_msb_out;
|
||||
host_compare_with_zero_equality(
|
||||
msb_streams, gpu_indexes, gpu_count, are_all_msb_zeros, msb,
|
||||
mem_ptr, bsks, ksks, num_msb_radix_blocks, mem_ptr->is_zero_lut);
|
||||
|
||||
auto sign_bit_pos = (int)log2(message_modulus) - 1;
|
||||
CMP_ORDERING msb_ordering;
|
||||
if (msb_are_zeros == 1)
|
||||
msb_ordering = CMP_ORDERING::IS_EQUAL;
|
||||
else
|
||||
msb_ordering = CMP_ORDERING::IS_SUPERIOR;
|
||||
|
||||
auto lut_f = [mem_ptr, sign_bit_pos](Torus sign_block,
|
||||
Torus msb_are_zeros) {
|
||||
bool sign_bit_is_set = (sign_block >> sign_bit_pos) == 1;
|
||||
CMP_ORDERING sign_block_ordering;
|
||||
if (sign_bit_is_set) {
|
||||
sign_block_ordering = CMP_ORDERING::IS_INFERIOR;
|
||||
} else if (sign_block != 0) {
|
||||
sign_block_ordering = CMP_ORDERING::IS_SUPERIOR;
|
||||
} else {
|
||||
sign_block_ordering = CMP_ORDERING::IS_EQUAL;
|
||||
}
|
||||
return mem_ptr->diff_buffer->tree_buffer->block_selector_f(
|
||||
sign_block_ordering, msb_ordering);
|
||||
};
|
||||
|
||||
CMP_ORDERING msb_ordering;
|
||||
if (msb_are_zeros == 1)
|
||||
msb_ordering = CMP_ORDERING::IS_EQUAL;
|
||||
else
|
||||
msb_ordering = CMP_ORDERING::IS_SUPERIOR;
|
||||
auto signed_msb_lut = mem_ptr->signed_msb_lut;
|
||||
generate_device_accumulator_bivariate<Torus>(
|
||||
msb_streams[0], gpu_indexes[0],
|
||||
signed_msb_lut->get_lut(gpu_indexes[0], 0), params.glwe_dimension,
|
||||
params.polynomial_size, params.message_modulus, params.carry_modulus,
|
||||
lut_f);
|
||||
signed_msb_lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
|
||||
|
||||
return mem_ptr->diff_buffer->tree_buffer->block_selector_f(
|
||||
sign_block_ordering, msb_ordering);
|
||||
};
|
||||
|
||||
auto signed_msb_lut = mem_ptr->signed_msb_lut;
|
||||
generate_device_accumulator_bivariate<Torus>(
|
||||
msb_streams[0], gpu_indexes[0],
|
||||
signed_msb_lut->get_lut(gpu_indexes[0], 0), params.glwe_dimension,
|
||||
params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, lut_f);
|
||||
signed_msb_lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
|
||||
|
||||
Torus *sign_block = msb + (num_msb_radix_blocks - 1) * big_lwe_size;
|
||||
integer_radix_apply_bivariate_lookup_table_kb(
|
||||
msb_streams, gpu_indexes, gpu_count, lwe_array_msb_out, sign_block,
|
||||
are_all_msb_zeros, bsks, ksks, 1, signed_msb_lut,
|
||||
signed_msb_lut->params.message_modulus);
|
||||
}
|
||||
}
|
||||
for (uint j = 0; j < gpu_count; j++) {
|
||||
Torus *sign_block = msb + (num_msb_radix_blocks - 1) * big_lwe_size;
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
msb_streams, gpu_indexes, gpu_count, lwe_array_msb_out, sign_block,
|
||||
are_all_msb_zeros, bsks, ksks, 1, signed_msb_lut,
|
||||
signed_msb_lut->params.message_modulus);
|
||||
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
|
||||
cuda_synchronize_stream(lsb_streams[j], gpu_indexes[j]);
|
||||
cuda_synchronize_stream(msb_streams[j], gpu_indexes[j]);
|
||||
}
|
||||
|
||||
//////////////
|
||||
// Reduce the two blocks into one final
|
||||
reduce_signs(streams, gpu_indexes, gpu_count, lwe_array_out,
|
||||
lwe_array_lsb_out, mem_ptr, sign_handler_f, bsks, ksks, 2);
|
||||
reduce_signs<Torus>(streams, gpu_indexes, gpu_count, lwe_array_out,
|
||||
lwe_array_lsb_out, mem_ptr, sign_handler_f, bsks, ksks,
|
||||
2);
|
||||
|
||||
} else {
|
||||
// We only have to do the regular comparison
|
||||
@@ -422,50 +457,39 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
|
||||
auto lwe_array_ct_out = mem_ptr->tmp_lwe_array_out;
|
||||
auto lwe_array_sign_out =
|
||||
lwe_array_ct_out + (num_lsb_radix_blocks / 2) * big_lwe_size;
|
||||
#pragma omp parallel sections
|
||||
{
|
||||
// Both sections may be executed in parallel
|
||||
#pragma omp section
|
||||
{
|
||||
Torus *lhs = diff_buffer->tmp_packed_left;
|
||||
Torus *rhs = diff_buffer->tmp_packed_right;
|
||||
Torus *lhs = diff_buffer->tmp_packed_left;
|
||||
Torus *rhs = diff_buffer->tmp_packed_right;
|
||||
|
||||
pack_blocks(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in,
|
||||
big_lwe_dimension, num_lsb_radix_blocks - 1,
|
||||
message_modulus);
|
||||
pack_blocks(lsb_streams[0], gpu_indexes[0], rhs, scalar_blocks, 0,
|
||||
num_lsb_radix_blocks - 1, message_modulus);
|
||||
pack_blocks<Torus>(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in,
|
||||
big_lwe_dimension, num_lsb_radix_blocks - 1,
|
||||
message_modulus);
|
||||
pack_blocks<Torus>(lsb_streams[0], gpu_indexes[0], rhs, scalar_blocks, 0,
|
||||
num_lsb_radix_blocks - 1, message_modulus);
|
||||
|
||||
// From this point we have half number of blocks
|
||||
num_lsb_radix_blocks /= 2;
|
||||
// From this point we have half number of blocks
|
||||
num_lsb_radix_blocks /= 2;
|
||||
|
||||
// comparisons will be assigned
|
||||
// - 0 if lhs < rhs
|
||||
// - 1 if lhs == rhs
|
||||
// - 2 if lhs > rhs
|
||||
scalar_compare_radix_blocks_kb(lsb_streams, gpu_indexes, gpu_count,
|
||||
lwe_array_ct_out, lhs, rhs, mem_ptr,
|
||||
bsks, ksks, num_lsb_radix_blocks);
|
||||
}
|
||||
#pragma omp section
|
||||
{
|
||||
Torus *encrypted_sign_block =
|
||||
lwe_array_in + (total_num_radix_blocks - 1) * big_lwe_size;
|
||||
Torus *scalar_sign_block =
|
||||
scalar_blocks + (total_num_scalar_blocks - 1);
|
||||
// comparisons will be assigned
|
||||
// - 0 if lhs < rhs
|
||||
// - 1 if lhs == rhs
|
||||
// - 2 if lhs > rhs
|
||||
scalar_compare_radix_blocks_kb<Torus>(lsb_streams, gpu_indexes, gpu_count,
|
||||
lwe_array_ct_out, lhs, rhs, mem_ptr,
|
||||
bsks, ksks, num_lsb_radix_blocks);
|
||||
Torus *encrypted_sign_block =
|
||||
lwe_array_in + (total_num_radix_blocks - 1) * big_lwe_size;
|
||||
Torus *scalar_sign_block = scalar_blocks + (total_num_scalar_blocks - 1);
|
||||
|
||||
auto trivial_sign_block = mem_ptr->tmp_trivial_sign_block;
|
||||
create_trivial_radix(msb_streams[0], gpu_indexes[0], trivial_sign_block,
|
||||
scalar_sign_block, big_lwe_dimension, 1, 1,
|
||||
message_modulus, carry_modulus);
|
||||
auto trivial_sign_block = mem_ptr->tmp_trivial_sign_block;
|
||||
create_trivial_radix<Torus>(
|
||||
msb_streams[0], gpu_indexes[0], trivial_sign_block, scalar_sign_block,
|
||||
big_lwe_dimension, 1, 1, message_modulus, carry_modulus);
|
||||
|
||||
integer_radix_apply_bivariate_lookup_table_kb(
|
||||
msb_streams, gpu_indexes, gpu_count, lwe_array_sign_out,
|
||||
encrypted_sign_block, trivial_sign_block, bsks, ksks, 1,
|
||||
mem_ptr->signed_lut, mem_ptr->signed_lut->params.message_modulus);
|
||||
}
|
||||
}
|
||||
for (uint j = 0; j < gpu_count; j++) {
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
msb_streams, gpu_indexes, gpu_count, lwe_array_sign_out,
|
||||
encrypted_sign_block, trivial_sign_block, bsks, ksks, 1,
|
||||
mem_ptr->signed_lut, mem_ptr->signed_lut->params.message_modulus);
|
||||
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
|
||||
cuda_synchronize_stream(lsb_streams[j], gpu_indexes[j]);
|
||||
cuda_synchronize_stream(msb_streams[j], gpu_indexes[j]);
|
||||
}
|
||||
@@ -473,9 +497,9 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
|
||||
// Reduces a vec containing radix blocks that encrypts a sign
|
||||
// (inferior, equal, superior) to one single radix block containing the
|
||||
// final sign
|
||||
reduce_signs(streams, gpu_indexes, gpu_count, lwe_array_out,
|
||||
lwe_array_ct_out, mem_ptr, sign_handler_f, bsks, ksks,
|
||||
num_lsb_radix_blocks + 1);
|
||||
reduce_signs<Torus>(streams, gpu_indexes, gpu_count, lwe_array_out,
|
||||
lwe_array_ct_out, mem_ptr, sign_handler_f, bsks, ksks,
|
||||
num_lsb_radix_blocks + 1);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -486,14 +510,13 @@ __host__ void integer_radix_signed_scalar_maxmin_kb(
|
||||
int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
|
||||
uint32_t total_num_radix_blocks, uint32_t total_num_scalar_blocks) {
|
||||
|
||||
cudaSetDevice(gpu_indexes[0]);
|
||||
auto params = mem_ptr->params;
|
||||
// Calculates the difference sign between the ciphertext and the scalar
|
||||
// - 0 if lhs < rhs
|
||||
// - 1 if lhs == rhs
|
||||
// - 2 if lhs > rhs
|
||||
auto sign = mem_ptr->tmp_lwe_array_out;
|
||||
integer_radix_signed_scalar_difference_check_kb(
|
||||
integer_radix_signed_scalar_difference_check_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, sign, lwe_array_in, scalar_blocks,
|
||||
mem_ptr, mem_ptr->identity_lut_f, bsks, ksks, total_num_radix_blocks,
|
||||
total_num_scalar_blocks);
|
||||
@@ -503,17 +526,17 @@ __host__ void integer_radix_signed_scalar_maxmin_kb(
|
||||
auto lwe_array_left = lwe_array_in;
|
||||
auto lwe_array_right = mem_ptr->tmp_block_comparisons;
|
||||
|
||||
create_trivial_radix(streams[0], gpu_indexes[0], lwe_array_right,
|
||||
scalar_blocks, params.big_lwe_dimension,
|
||||
total_num_radix_blocks, total_num_scalar_blocks,
|
||||
params.message_modulus, params.carry_modulus);
|
||||
create_trivial_radix<Torus>(streams[0], gpu_indexes[0], lwe_array_right,
|
||||
scalar_blocks, params.big_lwe_dimension,
|
||||
total_num_radix_blocks, total_num_scalar_blocks,
|
||||
params.message_modulus, params.carry_modulus);
|
||||
|
||||
// Selector
|
||||
// CMUX for Max or Min
|
||||
host_integer_radix_cmux_kb(streams, gpu_indexes, gpu_count, lwe_array_out,
|
||||
sign, lwe_array_left, lwe_array_right,
|
||||
mem_ptr->cmux_buffer, bsks, ksks,
|
||||
total_num_radix_blocks);
|
||||
host_integer_radix_cmux_kb<Torus>(streams, gpu_indexes, gpu_count,
|
||||
lwe_array_out, sign, lwe_array_left,
|
||||
lwe_array_right, mem_ptr->cmux_buffer, bsks,
|
||||
ksks, total_num_radix_blocks);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
@@ -526,12 +549,12 @@ __host__ void host_integer_radix_scalar_difference_check_kb(
|
||||
|
||||
if (mem_ptr->is_signed) {
|
||||
// is signed and scalar is positive
|
||||
integer_radix_signed_scalar_difference_check_kb(
|
||||
integer_radix_signed_scalar_difference_check_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in,
|
||||
scalar_blocks, mem_ptr, sign_handler_f, bsks, ksks,
|
||||
total_num_radix_blocks, total_num_scalar_blocks);
|
||||
} else {
|
||||
integer_radix_unsigned_scalar_difference_check_kb(
|
||||
integer_radix_unsigned_scalar_difference_check_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in,
|
||||
scalar_blocks, mem_ptr, sign_handler_f, bsks, ksks,
|
||||
total_num_radix_blocks, total_num_scalar_blocks);
|
||||
@@ -547,68 +570,16 @@ __host__ void host_integer_radix_signed_scalar_maxmin_kb(
|
||||
|
||||
if (mem_ptr->is_signed) {
|
||||
// is signed and scalar is positive
|
||||
integer_radix_signed_scalar_maxmin_kb(
|
||||
integer_radix_signed_scalar_maxmin_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in,
|
||||
scalar_blocks, mem_ptr, bsks, ksks, total_num_radix_blocks,
|
||||
total_num_scalar_blocks);
|
||||
} else {
|
||||
integer_radix_unsigned_scalar_maxmin_kb(
|
||||
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in,
|
||||
scalar_blocks, mem_ptr, bsks, ksks, total_num_radix_blocks,
|
||||
total_num_scalar_blocks);
|
||||
PANIC("Cuda error: only signed scalar maxmin can be called in signed "
|
||||
"scalar comparison")
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void scalar_compare_radix_blocks_kb(
|
||||
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
|
||||
Torus *lwe_array_out, Torus *lwe_array_in, Torus *scalar_blocks,
|
||||
int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
|
||||
uint32_t num_radix_blocks) {
|
||||
|
||||
auto params = mem_ptr->params;
|
||||
auto big_lwe_dimension = params.big_lwe_dimension;
|
||||
auto message_modulus = params.message_modulus;
|
||||
auto carry_modulus = params.carry_modulus;
|
||||
|
||||
// When rhs > lhs, the subtraction will overflow, and the bit of padding will
|
||||
// be set to 1
|
||||
// meaning that the output of the pbs will be the negative (modulo message
|
||||
// space)
|
||||
//
|
||||
// Example:
|
||||
// lhs: 1, rhs: 3, message modulus: 4, carry modulus 4
|
||||
// lhs - rhs = -2 % (4 * 4) = 14 = 1|1110 (padding_bit|b4b3b2b1)
|
||||
// Since there was an overflow the bit of padding is 1 and not 0.
|
||||
// When applying the LUT for an input value of 14 we would expect 1,
|
||||
// but since the bit of padding is 1, we will get -1 modulus our message
|
||||
// space, so (-1) % (4 * 4) = 15 = 1|1111 We then add one and get 0 = 0|0000
|
||||
|
||||
auto subtracted_blocks = mem_ptr->tmp_block_comparisons;
|
||||
cuda_memcpy_async_gpu_to_gpu(subtracted_blocks, lwe_array_in,
|
||||
num_radix_blocks * (big_lwe_dimension + 1) *
|
||||
sizeof(Torus),
|
||||
streams[0], gpu_indexes[0]);
|
||||
// Subtract
|
||||
// Here we need the true lwe sub, not the one that comes from shortint.
|
||||
host_integer_radix_scalar_subtraction_inplace(
|
||||
streams, gpu_indexes, gpu_count, subtracted_blocks, scalar_blocks,
|
||||
big_lwe_dimension, num_radix_blocks, message_modulus, carry_modulus);
|
||||
|
||||
// Apply LUT to compare to 0
|
||||
auto sign_lut = mem_ptr->eq_buffer->is_non_zero_lut;
|
||||
integer_radix_apply_univariate_lookup_table_kb(
|
||||
streams, gpu_indexes, gpu_count, lwe_array_out, subtracted_blocks, bsks,
|
||||
ksks, num_radix_blocks, sign_lut);
|
||||
|
||||
// Add one
|
||||
// Here Lhs can have the following values: (-1) % (message modulus * carry
|
||||
// modulus), 0, 1 So the output values after the addition will be: 0, 1, 2
|
||||
host_integer_radix_add_scalar_one_inplace(
|
||||
streams, gpu_indexes, gpu_count, lwe_array_out, big_lwe_dimension,
|
||||
num_radix_blocks, message_modulus, carry_modulus);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void host_integer_radix_scalar_maxmin_kb(
|
||||
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
|
||||
@@ -623,7 +594,7 @@ __host__ void host_integer_radix_scalar_maxmin_kb(
|
||||
// - 1 if lhs == rhs
|
||||
// - 2 if lhs > rhs
|
||||
auto sign = mem_ptr->tmp_lwe_array_out;
|
||||
host_integer_radix_scalar_difference_check_kb(
|
||||
host_integer_radix_scalar_difference_check_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, sign, lwe_array_in, scalar_blocks,
|
||||
mem_ptr, mem_ptr->identity_lut_f, bsks, ksks, total_num_radix_blocks,
|
||||
total_num_scalar_blocks);
|
||||
@@ -633,17 +604,17 @@ __host__ void host_integer_radix_scalar_maxmin_kb(
|
||||
auto lwe_array_left = lwe_array_in;
|
||||
auto lwe_array_right = mem_ptr->tmp_block_comparisons;
|
||||
|
||||
create_trivial_radix(streams[0], gpu_indexes[0], lwe_array_right,
|
||||
scalar_blocks, params.big_lwe_dimension,
|
||||
total_num_radix_blocks, total_num_scalar_blocks,
|
||||
params.message_modulus, params.carry_modulus);
|
||||
create_trivial_radix<Torus>(streams[0], gpu_indexes[0], lwe_array_right,
|
||||
scalar_blocks, params.big_lwe_dimension,
|
||||
total_num_radix_blocks, total_num_scalar_blocks,
|
||||
params.message_modulus, params.carry_modulus);
|
||||
|
||||
// Selector
|
||||
// CMUX for Max or Min
|
||||
host_integer_radix_cmux_kb(streams, gpu_indexes, gpu_count, lwe_array_out,
|
||||
mem_ptr->tmp_lwe_array_out, lwe_array_left,
|
||||
lwe_array_right, mem_ptr->cmux_buffer, bsks, ksks,
|
||||
total_num_radix_blocks);
|
||||
host_integer_radix_cmux_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, lwe_array_out,
|
||||
mem_ptr->tmp_lwe_array_out, lwe_array_left, lwe_array_right,
|
||||
mem_ptr->cmux_buffer, bsks, ksks, total_num_radix_blocks);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
@@ -686,71 +657,61 @@ __host__ void host_integer_radix_scalar_equality_check_kb(
|
||||
auto lsb_streams = mem_ptr->lsb_streams;
|
||||
auto msb_streams = mem_ptr->msb_streams;
|
||||
|
||||
#pragma omp parallel sections
|
||||
{
|
||||
// Both sections may be executed in parallel
|
||||
#pragma omp section
|
||||
{
|
||||
if (num_halved_scalar_blocks > 0) {
|
||||
auto packed_blocks = mem_ptr->tmp_packed_input;
|
||||
auto packed_scalar =
|
||||
packed_blocks + big_lwe_size * num_halved_lsb_radix_blocks;
|
||||
if (num_halved_scalar_blocks > 0) {
|
||||
auto packed_blocks = mem_ptr->tmp_packed_input;
|
||||
auto packed_scalar =
|
||||
packed_blocks + big_lwe_size * num_halved_lsb_radix_blocks;
|
||||
|
||||
pack_blocks(lsb_streams[0], gpu_indexes[0], packed_blocks, lsb,
|
||||
big_lwe_dimension, num_lsb_radix_blocks, message_modulus);
|
||||
pack_blocks(lsb_streams[0], gpu_indexes[0], packed_scalar,
|
||||
scalar_blocks, 0, num_scalar_blocks, message_modulus);
|
||||
pack_blocks<Torus>(lsb_streams[0], gpu_indexes[0], packed_blocks, lsb,
|
||||
big_lwe_dimension, num_lsb_radix_blocks,
|
||||
message_modulus);
|
||||
pack_blocks<Torus>(lsb_streams[0], gpu_indexes[0], packed_scalar,
|
||||
scalar_blocks, 0, num_scalar_blocks, message_modulus);
|
||||
|
||||
cuda_memcpy_async_gpu_to_gpu(
|
||||
scalar_comparison_luts->get_lut_indexes(gpu_indexes[0], 0),
|
||||
packed_scalar, num_halved_scalar_blocks * sizeof(Torus),
|
||||
lsb_streams[0], gpu_indexes[0]);
|
||||
scalar_comparison_luts->broadcast_lut(lsb_streams, gpu_indexes, 0);
|
||||
cuda_memcpy_async_gpu_to_gpu(
|
||||
scalar_comparison_luts->get_lut_indexes(gpu_indexes[0], 0),
|
||||
packed_scalar, num_halved_scalar_blocks * sizeof(Torus), lsb_streams[0],
|
||||
gpu_indexes[0]);
|
||||
scalar_comparison_luts->broadcast_lut(lsb_streams, gpu_indexes, 0);
|
||||
|
||||
integer_radix_apply_univariate_lookup_table_kb(
|
||||
lsb_streams, gpu_indexes, gpu_count, lwe_array_lsb_out,
|
||||
packed_blocks, bsks, ksks, num_halved_lsb_radix_blocks,
|
||||
scalar_comparison_luts);
|
||||
}
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
lsb_streams, gpu_indexes, gpu_count, lwe_array_lsb_out, packed_blocks,
|
||||
bsks, ksks, num_halved_lsb_radix_blocks, scalar_comparison_luts);
|
||||
}
|
||||
//////////////
|
||||
// msb
|
||||
if (num_msb_radix_blocks > 0) {
|
||||
int_radix_lut<Torus> *msb_lut;
|
||||
switch (mem_ptr->op) {
|
||||
case COMPARISON_TYPE::EQ:
|
||||
msb_lut = mem_ptr->is_zero_lut;
|
||||
break;
|
||||
case COMPARISON_TYPE::NE:
|
||||
msb_lut = mem_ptr->eq_buffer->is_non_zero_lut;
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error: integer operation not supported")
|
||||
}
|
||||
#pragma omp section
|
||||
{
|
||||
//////////////
|
||||
// msb
|
||||
if (num_msb_radix_blocks > 0) {
|
||||
int_radix_lut<Torus> *msb_lut;
|
||||
switch (mem_ptr->op) {
|
||||
case COMPARISON_TYPE::EQ:
|
||||
msb_lut = mem_ptr->is_zero_lut;
|
||||
break;
|
||||
case COMPARISON_TYPE::NE:
|
||||
msb_lut = mem_ptr->eq_buffer->is_non_zero_lut;
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error: integer operation not supported")
|
||||
}
|
||||
|
||||
host_compare_with_zero_equality(msb_streams, gpu_indexes, gpu_count,
|
||||
lwe_array_msb_out, msb, mem_ptr, bsks,
|
||||
ksks, num_msb_radix_blocks, msb_lut);
|
||||
}
|
||||
}
|
||||
host_compare_with_zero_equality<Torus>(
|
||||
msb_streams, gpu_indexes, gpu_count, lwe_array_msb_out, msb, mem_ptr,
|
||||
bsks, ksks, num_msb_radix_blocks, msb_lut);
|
||||
}
|
||||
|
||||
for (uint j = 0; j < gpu_count; j++) {
|
||||
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
|
||||
cuda_synchronize_stream(lsb_streams[j], gpu_indexes[j]);
|
||||
cuda_synchronize_stream(msb_streams[j], gpu_indexes[j]);
|
||||
}
|
||||
|
||||
switch (mem_ptr->op) {
|
||||
case COMPARISON_TYPE::EQ:
|
||||
are_all_comparisons_block_true(
|
||||
are_all_comparisons_block_true<Torus>(
|
||||
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_lsb_out,
|
||||
mem_ptr, bsks, ksks,
|
||||
num_halved_scalar_blocks + (num_msb_radix_blocks > 0));
|
||||
break;
|
||||
case COMPARISON_TYPE::NE:
|
||||
is_at_least_one_comparisons_block_true(
|
||||
is_at_least_one_comparisons_block_true<Torus>(
|
||||
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_lsb_out,
|
||||
mem_ptr, bsks, ksks,
|
||||
num_halved_scalar_blocks + (num_msb_radix_blocks > 0));
|
||||
|
||||
@@ -33,22 +33,6 @@ __host__ void scratch_cuda_integer_radix_scalar_mul_kb(
|
||||
int_scalar_mul_buffer<T> **mem_ptr, uint32_t num_radix_blocks,
|
||||
int_radix_params params, bool allocate_gpu_memory) {
|
||||
|
||||
size_t sm_size = (params.big_lwe_dimension + 1) * sizeof(T);
|
||||
if (sm_size < cuda_get_max_shared_memory(gpu_indexes[0])) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
tree_add_chunks<T, FULLSM>, cudaFuncAttributeMaxDynamicSharedMemorySize,
|
||||
sm_size));
|
||||
cudaFuncSetCacheConfig(tree_add_chunks<T, FULLSM>,
|
||||
cudaFuncCachePreferShared);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
} else {
|
||||
check_cuda_error(
|
||||
cudaFuncSetAttribute(tree_add_chunks<T, NOSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
|
||||
cudaFuncSetCacheConfig(tree_add_chunks<T, NOSM>, cudaFuncCachePreferL1);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
*mem_ptr =
|
||||
new int_scalar_mul_buffer<T>(streams, gpu_indexes, gpu_count, params,
|
||||
num_radix_blocks, allocate_gpu_memory);
|
||||
@@ -81,7 +65,7 @@ __host__ void host_integer_scalar_mul_radix(
|
||||
cuda_memcpy_async_gpu_to_gpu(ptr, lwe_array,
|
||||
lwe_size_bytes * num_radix_blocks,
|
||||
streams[0], gpu_indexes[0]);
|
||||
host_integer_radix_logical_scalar_shift_kb_inplace(
|
||||
host_integer_radix_logical_scalar_shift_kb_inplace<T>(
|
||||
streams, gpu_indexes, gpu_count, ptr, shift_amount,
|
||||
mem->logical_scalar_shift_buffer, bsks, ksks, num_radix_blocks);
|
||||
} else {
|
||||
@@ -98,15 +82,16 @@ __host__ void host_integer_scalar_mul_radix(
|
||||
preshifted_buffer + (i % msg_bits) * num_radix_blocks * lwe_size;
|
||||
T *block_shift_buffer =
|
||||
all_shifted_buffer + j * num_radix_blocks * lwe_size;
|
||||
host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count,
|
||||
block_shift_buffer, preshifted_radix_ct,
|
||||
i / msg_bits, num_radix_blocks, lwe_size);
|
||||
host_radix_blocks_rotate_right<T>(
|
||||
streams, gpu_indexes, gpu_count, block_shift_buffer,
|
||||
preshifted_radix_ct, i / msg_bits, num_radix_blocks, lwe_size);
|
||||
// create trivial assign for value = 0
|
||||
cuda_memset_async(block_shift_buffer, 0, (i / msg_bits) * lwe_size_bytes,
|
||||
streams[0], gpu_indexes[0]);
|
||||
j++;
|
||||
}
|
||||
}
|
||||
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
|
||||
|
||||
cuda_drop_async(preshifted_buffer, streams[0], gpu_indexes[0]);
|
||||
mem->logical_scalar_shift_buffer->release(streams, gpu_indexes, gpu_count);
|
||||
@@ -121,10 +106,15 @@ __host__ void host_integer_scalar_mul_radix(
|
||||
for (int i = 0; i < j * num_radix_blocks; i++) {
|
||||
terms_degree[i] = message_modulus - 1;
|
||||
}
|
||||
host_integer_sum_ciphertexts_vec_kb<T, params>(
|
||||
host_integer_partial_sum_ciphertexts_vec_kb<T, params>(
|
||||
streams, gpu_indexes, gpu_count, lwe_array, all_shifted_buffer,
|
||||
terms_degree, bsks, ksks, mem->sum_ciphertexts_vec_mem,
|
||||
num_radix_blocks, j);
|
||||
num_radix_blocks, j, nullptr);
|
||||
|
||||
auto scp_mem_ptr = mem->sum_ciphertexts_vec_mem->scp_mem;
|
||||
host_propagate_single_carry<T>(streams, gpu_indexes, gpu_count, lwe_array,
|
||||
nullptr, nullptr, scp_mem_ptr, bsks, ksks,
|
||||
num_radix_blocks);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -49,8 +49,6 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(
|
||||
|
||||
Torus *rotated_buffer = mem->tmp_rotated;
|
||||
|
||||
auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];
|
||||
|
||||
// rotate right all the blocks in radix ciphertext
|
||||
// copy result in new buffer
|
||||
// 256 threads are used in every block
|
||||
@@ -58,9 +56,9 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(
|
||||
// one block is responsible to process single lwe ciphertext
|
||||
if (mem->shift_type == LEFT_SHIFT) {
|
||||
// rotate right as the blocks are from LSB to MSB
|
||||
host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count,
|
||||
rotated_buffer, lwe_array, rotations,
|
||||
num_blocks, big_lwe_size);
|
||||
host_radix_blocks_rotate_right<Torus>(streams, gpu_indexes, gpu_count,
|
||||
rotated_buffer, lwe_array, rotations,
|
||||
num_blocks, big_lwe_size);
|
||||
|
||||
cuda_memcpy_async_gpu_to_gpu(lwe_array, rotated_buffer,
|
||||
num_blocks * big_lwe_size_bytes, streams[0],
|
||||
@@ -72,9 +70,11 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(
|
||||
|
||||
auto receiver_blocks = lwe_array;
|
||||
auto giver_blocks = rotated_buffer;
|
||||
host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count,
|
||||
giver_blocks, lwe_array, 1, num_blocks,
|
||||
big_lwe_size);
|
||||
host_radix_blocks_rotate_right<Torus>(streams, gpu_indexes, gpu_count,
|
||||
giver_blocks, lwe_array, 1,
|
||||
num_blocks, big_lwe_size);
|
||||
|
||||
auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];
|
||||
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, lwe_array, receiver_blocks,
|
||||
@@ -83,9 +83,9 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(
|
||||
|
||||
} else {
|
||||
// rotate left as the blocks are from LSB to MSB
|
||||
host_radix_blocks_rotate_left(streams, gpu_indexes, gpu_count,
|
||||
rotated_buffer, lwe_array, rotations,
|
||||
num_blocks, big_lwe_size);
|
||||
host_radix_blocks_rotate_left<Torus>(streams, gpu_indexes, gpu_count,
|
||||
rotated_buffer, lwe_array, rotations,
|
||||
num_blocks, big_lwe_size);
|
||||
|
||||
cuda_memcpy_async_gpu_to_gpu(lwe_array, rotated_buffer,
|
||||
num_blocks * big_lwe_size_bytes, streams[0],
|
||||
@@ -97,8 +97,11 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(
|
||||
|
||||
auto receiver_blocks = lwe_array;
|
||||
auto giver_blocks = rotated_buffer;
|
||||
host_radix_blocks_rotate_left(streams, gpu_indexes, gpu_count, giver_blocks,
|
||||
lwe_array, 1, num_blocks, big_lwe_size);
|
||||
host_radix_blocks_rotate_left<Torus>(streams, gpu_indexes, gpu_count,
|
||||
giver_blocks, lwe_array, 1, num_blocks,
|
||||
big_lwe_size);
|
||||
|
||||
auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];
|
||||
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, lwe_array, receiver_blocks,
|
||||
|
||||
@@ -10,7 +10,6 @@
|
||||
#include "types/complex/operations.cuh"
|
||||
#include "utils/helper.cuh"
|
||||
#include "utils/kernel_dimensions.cuh"
|
||||
#include <omp.h>
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void scratch_cuda_integer_radix_logical_scalar_shift_kb(
|
||||
@@ -52,18 +51,11 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(
|
||||
Torus *full_rotated_buffer = mem->tmp_rotated;
|
||||
Torus *rotated_buffer = &full_rotated_buffer[big_lwe_size];
|
||||
|
||||
auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];
|
||||
|
||||
// rotate right all the blocks in radix ciphertext
|
||||
// copy result in new buffer
|
||||
// 1024 threads are used in every block
|
||||
// block_count blocks will be used in the grid
|
||||
// one block is responsible to process single lwe ciphertext
|
||||
if (mem->shift_type == LEFT_SHIFT) {
|
||||
// rotate right as the blocks are from LSB to MSB
|
||||
host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count,
|
||||
rotated_buffer, lwe_array, rotations,
|
||||
num_blocks, big_lwe_size);
|
||||
host_radix_blocks_rotate_right<Torus>(streams, gpu_indexes, gpu_count,
|
||||
rotated_buffer, lwe_array, rotations,
|
||||
num_blocks, big_lwe_size);
|
||||
|
||||
// create trivial assign for value = 0
|
||||
cuda_memset_async(rotated_buffer, 0, rotations * big_lwe_size_bytes,
|
||||
@@ -76,6 +68,7 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(
|
||||
return;
|
||||
}
|
||||
|
||||
auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];
|
||||
auto partial_current_blocks = &lwe_array[rotations * big_lwe_size];
|
||||
auto partial_previous_blocks =
|
||||
&full_rotated_buffer[rotations * big_lwe_size];
|
||||
@@ -90,9 +83,9 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(
|
||||
|
||||
} else {
|
||||
// right shift
|
||||
host_radix_blocks_rotate_left(streams, gpu_indexes, gpu_count,
|
||||
rotated_buffer, lwe_array, rotations,
|
||||
num_blocks, big_lwe_size);
|
||||
host_radix_blocks_rotate_left<Torus>(streams, gpu_indexes, gpu_count,
|
||||
rotated_buffer, lwe_array, rotations,
|
||||
num_blocks, big_lwe_size);
|
||||
|
||||
// rotate left as the blocks are from LSB to MSB
|
||||
// create trivial assign for value = 0
|
||||
@@ -109,6 +102,7 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(
|
||||
|
||||
auto partial_current_blocks = lwe_array;
|
||||
auto partial_next_blocks = &rotated_buffer[big_lwe_size];
|
||||
auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];
|
||||
|
||||
size_t partial_block_count = num_blocks - rotations;
|
||||
|
||||
@@ -139,8 +133,6 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
|
||||
int_arithmetic_scalar_shift_buffer<Torus> *mem, void **bsks, Torus **ksks,
|
||||
uint32_t num_blocks) {
|
||||
|
||||
cudaSetDevice(gpu_indexes[0]);
|
||||
|
||||
auto params = mem->params;
|
||||
auto glwe_dimension = params.glwe_dimension;
|
||||
auto polynomial_size = params.polynomial_size;
|
||||
@@ -160,19 +152,13 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
|
||||
size_t shift_within_block = shift % num_bits_in_block;
|
||||
|
||||
Torus *rotated_buffer = mem->tmp_rotated;
|
||||
Torus *padding_block = &rotated_buffer[num_blocks * big_lwe_size];
|
||||
Torus *padding_block = &rotated_buffer[(num_blocks + 1) * big_lwe_size];
|
||||
Torus *last_block_copy = &padding_block[big_lwe_size];
|
||||
|
||||
auto lut_univariate_shift_last_block =
|
||||
mem->lut_buffers_univariate[shift_within_block - 1];
|
||||
auto lut_univariate_padding_block =
|
||||
mem->lut_buffers_univariate[num_bits_in_block - 1];
|
||||
auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];
|
||||
|
||||
if (mem->shift_type == RIGHT_SHIFT) {
|
||||
host_radix_blocks_rotate_left(streams, gpu_indexes, gpu_count,
|
||||
rotated_buffer, lwe_array, rotations,
|
||||
num_blocks, big_lwe_size);
|
||||
host_radix_blocks_rotate_left<Torus>(streams, gpu_indexes, gpu_count,
|
||||
rotated_buffer, lwe_array, rotations,
|
||||
num_blocks, big_lwe_size);
|
||||
cuda_memcpy_async_gpu_to_gpu(lwe_array, rotated_buffer,
|
||||
num_blocks * big_lwe_size_bytes, streams[0],
|
||||
gpu_indexes[0]);
|
||||
@@ -197,59 +183,59 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
|
||||
return;
|
||||
}
|
||||
|
||||
// In the arithmetic shift case we have to pad with the value of the sign
|
||||
// bit. This creates the need for a different shifting lut than in the
|
||||
// logical shift case. We also need another PBS to create the padding block.
|
||||
Torus *last_block = lwe_array + (num_blocks - rotations - 1) * big_lwe_size;
|
||||
cuda_memcpy_async_gpu_to_gpu(
|
||||
last_block_copy,
|
||||
rotated_buffer + (num_blocks - rotations - 1) * big_lwe_size,
|
||||
big_lwe_size_bytes, streams[0], gpu_indexes[0]);
|
||||
auto partial_current_blocks = lwe_array;
|
||||
auto partial_next_blocks = &rotated_buffer[big_lwe_size];
|
||||
size_t partial_block_count = num_blocks - rotations;
|
||||
if (shift_within_block != 0 && rotations != num_blocks) {
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, partial_current_blocks,
|
||||
partial_current_blocks, partial_next_blocks, bsks, ksks,
|
||||
partial_block_count, lut_bivariate,
|
||||
lut_bivariate->params.message_modulus);
|
||||
}
|
||||
// Since our CPU threads will be working on different streams we shall
|
||||
// assert the work in the main stream is completed
|
||||
for (uint j = 0; j < gpu_count; j++) {
|
||||
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
|
||||
}
|
||||
#pragma omp parallel sections
|
||||
{
|
||||
// All sections may be executed in parallel
|
||||
#pragma omp section
|
||||
{
|
||||
integer_radix_apply_univariate_lookup_table_kb(
|
||||
mem->local_streams_1, gpu_indexes, gpu_count, padding_block,
|
||||
last_block_copy, bsks, ksks, 1, lut_univariate_padding_block);
|
||||
// Replace blocks 'pulled' from the left with the correct padding block
|
||||
for (uint i = 0; i < rotations; i++) {
|
||||
cuda_memcpy_async_gpu_to_gpu(
|
||||
lwe_array + (num_blocks - rotations + i) * big_lwe_size,
|
||||
padding_block, big_lwe_size_bytes, mem->local_streams_1[0],
|
||||
gpu_indexes[0]);
|
||||
}
|
||||
}
|
||||
#pragma omp section
|
||||
{
|
||||
if (shift_within_block != 0 && rotations != num_blocks) {
|
||||
integer_radix_apply_univariate_lookup_table_kb(
|
||||
mem->local_streams_2, gpu_indexes, gpu_count, last_block,
|
||||
last_block_copy, bsks, ksks, 1, lut_univariate_shift_last_block);
|
||||
}
|
||||
}
|
||||
}
|
||||
for (uint j = 0; j < gpu_count; j++) {
|
||||
cuda_synchronize_stream(mem->local_streams_1[j], gpu_indexes[j]);
|
||||
cuda_synchronize_stream(mem->local_streams_2[j], gpu_indexes[j]);
|
||||
}
|
||||
if (num_blocks != rotations) {
|
||||
// In the arithmetic shift case we have to pad with the value of the sign
|
||||
// bit. This creates the need for a different shifting lut than in the
|
||||
// logical shift case. We also need another PBS to create the padding
|
||||
// block.
|
||||
Torus *last_block =
|
||||
lwe_array + (num_blocks - rotations - 1) * big_lwe_size;
|
||||
cuda_memcpy_async_gpu_to_gpu(
|
||||
last_block_copy,
|
||||
rotated_buffer + (num_blocks - rotations - 1) * big_lwe_size,
|
||||
big_lwe_size_bytes, streams[0], gpu_indexes[0]);
|
||||
if (shift_within_block != 0) {
|
||||
auto partial_current_blocks = lwe_array;
|
||||
auto partial_next_blocks = &rotated_buffer[big_lwe_size];
|
||||
size_t partial_block_count = num_blocks - rotations;
|
||||
auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];
|
||||
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, partial_current_blocks,
|
||||
partial_current_blocks, partial_next_blocks, bsks, ksks,
|
||||
partial_block_count, lut_bivariate,
|
||||
lut_bivariate->params.message_modulus);
|
||||
}
|
||||
// Since our CPU threads will be working on different streams we shall
|
||||
// assert the work in the main stream is completed
|
||||
for (uint j = 0; j < gpu_count; j++) {
|
||||
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
|
||||
}
|
||||
auto lut_univariate_padding_block =
|
||||
mem->lut_buffers_univariate[num_bits_in_block - 1];
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
mem->local_streams_1, gpu_indexes, gpu_count, padding_block,
|
||||
last_block_copy, bsks, ksks, 1, lut_univariate_padding_block);
|
||||
// Replace blocks 'pulled' from the left with the correct padding
|
||||
// block
|
||||
for (uint i = 0; i < rotations; i++) {
|
||||
cuda_memcpy_async_gpu_to_gpu(lwe_array + (num_blocks - rotations + i) *
|
||||
big_lwe_size,
|
||||
padding_block, big_lwe_size_bytes,
|
||||
mem->local_streams_1[0], gpu_indexes[0]);
|
||||
}
|
||||
if (shift_within_block != 0) {
|
||||
auto lut_univariate_shift_last_block =
|
||||
mem->lut_buffers_univariate[shift_within_block - 1];
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
mem->local_streams_2, gpu_indexes, gpu_count, last_block,
|
||||
last_block_copy, bsks, ksks, 1, lut_univariate_shift_last_block);
|
||||
}
|
||||
for (uint j = 0; j < mem->active_gpu_count; j++) {
|
||||
cuda_synchronize_stream(mem->local_streams_1[j], gpu_indexes[j]);
|
||||
cuda_synchronize_stream(mem->local_streams_2[j], gpu_indexes[j]);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
PANIC("Cuda error (scalar shift): left scalar shift is never of the "
|
||||
"arithmetic type")
|
||||
|
||||
@@ -37,8 +37,6 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
|
||||
auto big_lwe_size = big_lwe_dimension + 1;
|
||||
auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
|
||||
|
||||
cudaSetDevice(gpu_indexes[0]);
|
||||
|
||||
// Extract all bits
|
||||
auto bits = mem->tmp_bits;
|
||||
extract_n_bits<Torus>(streams, gpu_indexes, gpu_count, bits, lwe_array, bsks,
|
||||
@@ -90,9 +88,9 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
|
||||
switch (mem->shift_type) {
|
||||
case LEFT_SHIFT:
|
||||
// rotate right as the blocks are from LSB to MSB
|
||||
host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count,
|
||||
rotated_input, input_bits_b, rotations,
|
||||
total_nb_bits, big_lwe_size);
|
||||
host_radix_blocks_rotate_right<Torus>(
|
||||
streams, gpu_indexes, gpu_count, rotated_input, input_bits_b,
|
||||
rotations, total_nb_bits, big_lwe_size);
|
||||
|
||||
if (mem->is_signed && mem->shift_type == RIGHT_SHIFT)
|
||||
for (int i = 0; i < rotations; i++)
|
||||
@@ -105,9 +103,9 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
|
||||
break;
|
||||
case RIGHT_SHIFT:
|
||||
// rotate left as the blocks are from LSB to MSB
|
||||
host_radix_blocks_rotate_left(streams, gpu_indexes, gpu_count,
|
||||
rotated_input, input_bits_b, rotations,
|
||||
total_nb_bits, big_lwe_size);
|
||||
host_radix_blocks_rotate_left<Torus>(
|
||||
streams, gpu_indexes, gpu_count, rotated_input, input_bits_b,
|
||||
rotations, total_nb_bits, big_lwe_size);
|
||||
|
||||
if (mem->is_signed)
|
||||
for (int i = 0; i < rotations; i++)
|
||||
@@ -121,38 +119,37 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
|
||||
break;
|
||||
case LEFT_ROTATE:
|
||||
// rotate right as the blocks are from LSB to MSB
|
||||
host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count,
|
||||
rotated_input, input_bits_b, rotations,
|
||||
total_nb_bits, big_lwe_size);
|
||||
host_radix_blocks_rotate_right<Torus>(
|
||||
streams, gpu_indexes, gpu_count, rotated_input, input_bits_b,
|
||||
rotations, total_nb_bits, big_lwe_size);
|
||||
break;
|
||||
case RIGHT_ROTATE:
|
||||
// rotate left as the blocks are from LSB to MSB
|
||||
host_radix_blocks_rotate_left(streams, gpu_indexes, gpu_count,
|
||||
rotated_input, input_bits_b, rotations,
|
||||
total_nb_bits, big_lwe_size);
|
||||
host_radix_blocks_rotate_left<Torus>(
|
||||
streams, gpu_indexes, gpu_count, rotated_input, input_bits_b,
|
||||
rotations, total_nb_bits, big_lwe_size);
|
||||
break;
|
||||
default:
|
||||
PANIC("Unknown operation")
|
||||
}
|
||||
|
||||
// pack bits into one block so that we have
|
||||
// host_pack bits into one block so that we have
|
||||
// control_bit|b|a
|
||||
cuda_memset_async(mux_inputs, 0, total_nb_bits * big_lwe_size_bytes,
|
||||
streams[0], gpu_indexes[0]); // Do we need this?
|
||||
pack_bivariate_blocks(streams, gpu_indexes, gpu_count, mux_inputs,
|
||||
mux_lut->lwe_indexes_out, rotated_input, input_bits_a,
|
||||
mux_lut->lwe_indexes_in, big_lwe_dimension, 2,
|
||||
total_nb_bits);
|
||||
pack_bivariate_blocks<Torus>(streams, gpu_indexes, gpu_count, mux_inputs,
|
||||
mux_lut->lwe_indexes_out, rotated_input,
|
||||
input_bits_a, mux_lut->lwe_indexes_in,
|
||||
big_lwe_dimension, 2, total_nb_bits);
|
||||
|
||||
// The shift bit is already properly aligned/positioned
|
||||
for (int i = 0; i < total_nb_bits; i++)
|
||||
host_addition(streams[0], gpu_indexes[0], mux_inputs + i * big_lwe_size,
|
||||
mux_inputs + i * big_lwe_size, shift_bit,
|
||||
mem->params.big_lwe_dimension, 1);
|
||||
host_addition<Torus>(streams[0], gpu_indexes[0],
|
||||
mux_inputs + i * big_lwe_size,
|
||||
mux_inputs + i * big_lwe_size, shift_bit,
|
||||
mem->params.big_lwe_dimension, 1);
|
||||
|
||||
// we have
|
||||
// control_bit|b|a
|
||||
integer_radix_apply_univariate_lookup_table_kb(
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, input_bits_a, mux_inputs, bsks, ksks,
|
||||
total_nb_bits, mux_lut);
|
||||
}
|
||||
@@ -181,8 +178,8 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
|
||||
auto bit_to_add = input_bits_a + i * big_lwe_size;
|
||||
|
||||
for (int j = 0; j < num_radix_blocks; j++) {
|
||||
host_addition(streams[0], gpu_indexes[0], block, block, bit_to_add,
|
||||
big_lwe_dimension, 1);
|
||||
host_addition<Torus>(streams[0], gpu_indexes[0], block, block, bit_to_add,
|
||||
big_lwe_dimension, 1);
|
||||
|
||||
block += big_lwe_size;
|
||||
bit_to_add += bits_per_block * big_lwe_size;
|
||||
@@ -190,7 +187,7 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
|
||||
|
||||
// To give back a clean ciphertext
|
||||
auto cleaning_lut = mem->cleaning_lut;
|
||||
integer_radix_apply_univariate_lookup_table_kb(
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, lwe_last_out, lwe_last_out, bsks, ksks,
|
||||
num_radix_blocks, cleaning_lut);
|
||||
}
|
||||
|
||||
@@ -11,11 +11,11 @@ void cuda_add_lwe_ciphertext_vector_32(void *stream, uint32_t gpu_index,
|
||||
uint32_t input_lwe_dimension,
|
||||
uint32_t input_lwe_ciphertext_count) {
|
||||
|
||||
host_addition(static_cast<cudaStream_t>(stream), gpu_index,
|
||||
static_cast<uint32_t *>(lwe_array_out),
|
||||
static_cast<uint32_t *>(lwe_array_in_1),
|
||||
static_cast<uint32_t *>(lwe_array_in_2), input_lwe_dimension,
|
||||
input_lwe_ciphertext_count);
|
||||
host_addition<uint32_t>(static_cast<cudaStream_t>(stream), gpu_index,
|
||||
static_cast<uint32_t *>(lwe_array_out),
|
||||
static_cast<uint32_t *>(lwe_array_in_1),
|
||||
static_cast<uint32_t *>(lwe_array_in_2),
|
||||
input_lwe_dimension, input_lwe_ciphertext_count);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -51,11 +51,11 @@ void cuda_add_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index,
|
||||
uint32_t input_lwe_dimension,
|
||||
uint32_t input_lwe_ciphertext_count) {
|
||||
|
||||
host_addition(static_cast<cudaStream_t>(stream), gpu_index,
|
||||
static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<uint64_t *>(lwe_array_in_1),
|
||||
static_cast<uint64_t *>(lwe_array_in_2), input_lwe_dimension,
|
||||
input_lwe_ciphertext_count);
|
||||
host_addition<uint64_t>(static_cast<cudaStream_t>(stream), gpu_index,
|
||||
static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<uint64_t *>(lwe_array_in_1),
|
||||
static_cast<uint64_t *>(lwe_array_in_2),
|
||||
input_lwe_dimension, input_lwe_ciphertext_count);
|
||||
}
|
||||
/*
|
||||
* Perform the addition of a u32 input LWE ciphertext vector with a u32
|
||||
@@ -66,11 +66,12 @@ void cuda_add_lwe_ciphertext_vector_plaintext_vector_32(
|
||||
void *plaintext_array_in, uint32_t input_lwe_dimension,
|
||||
uint32_t input_lwe_ciphertext_count) {
|
||||
|
||||
host_addition_plaintext(static_cast<cudaStream_t>(stream), gpu_index,
|
||||
static_cast<uint32_t *>(lwe_array_out),
|
||||
static_cast<uint32_t *>(lwe_array_in),
|
||||
static_cast<uint32_t *>(plaintext_array_in),
|
||||
input_lwe_dimension, input_lwe_ciphertext_count);
|
||||
host_addition_plaintext<uint32_t>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index,
|
||||
static_cast<uint32_t *>(lwe_array_out),
|
||||
static_cast<uint32_t *>(lwe_array_in),
|
||||
static_cast<uint32_t *>(plaintext_array_in), input_lwe_dimension,
|
||||
input_lwe_ciphertext_count);
|
||||
}
|
||||
/*
|
||||
* Perform the addition of a u64 input LWE ciphertext vector with a u64 input
|
||||
@@ -105,9 +106,10 @@ void cuda_add_lwe_ciphertext_vector_plaintext_vector_64(
|
||||
void *plaintext_array_in, uint32_t input_lwe_dimension,
|
||||
uint32_t input_lwe_ciphertext_count) {
|
||||
|
||||
host_addition_plaintext(static_cast<cudaStream_t>(stream), gpu_index,
|
||||
static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<uint64_t *>(lwe_array_in),
|
||||
static_cast<uint64_t *>(plaintext_array_in),
|
||||
input_lwe_dimension, input_lwe_ciphertext_count);
|
||||
host_addition_plaintext<uint64_t>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index,
|
||||
static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<uint64_t *>(lwe_array_in),
|
||||
static_cast<uint64_t *>(plaintext_array_in), input_lwe_dimension,
|
||||
input_lwe_ciphertext_count);
|
||||
}
|
||||
|
||||
@@ -40,10 +40,10 @@ host_addition_plaintext(cudaStream_t stream, uint32_t gpu_index, T *output,
|
||||
dim3 grid(num_blocks, 1, 1);
|
||||
dim3 thds(num_threads, 1, 1);
|
||||
|
||||
cuda_memcpy_async_gpu_to_gpu(output, lwe_input,
|
||||
(lwe_dimension + 1) * lwe_ciphertext_count,
|
||||
stream, gpu_index);
|
||||
plaintext_addition<<<grid, thds, 0, stream>>>(
|
||||
cuda_memcpy_async_gpu_to_gpu(
|
||||
output, lwe_input, (lwe_dimension + 1) * lwe_ciphertext_count * sizeof(T),
|
||||
stream, gpu_index);
|
||||
plaintext_addition<T><<<grid, thds, 0, stream>>>(
|
||||
output, lwe_input, plaintext_input, lwe_dimension, num_entries);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
@@ -78,7 +78,7 @@ __host__ void host_addition(cudaStream_t stream, uint32_t gpu_index, T *output,
|
||||
dim3 grid(num_blocks, 1, 1);
|
||||
dim3 thds(num_threads, 1, 1);
|
||||
|
||||
addition<<<grid, thds, 0, stream>>>(output, input_1, input_2, num_entries);
|
||||
addition<T><<<grid, thds, 0, stream>>>(output, input_1, input_2, num_entries);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
@@ -112,7 +112,8 @@ __host__ void host_subtraction(cudaStream_t stream, uint32_t gpu_index,
|
||||
dim3 grid(num_blocks, 1, 1);
|
||||
dim3 thds(num_threads, 1, 1);
|
||||
|
||||
subtraction<<<grid, thds, 0, stream>>>(output, input_1, input_2, num_entries);
|
||||
subtraction<T>
|
||||
<<<grid, thds, 0, stream>>>(output, input_1, input_2, num_entries);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
@@ -150,7 +151,7 @@ __host__ void host_subtraction_plaintext(cudaStream_t stream,
|
||||
(input_lwe_dimension + 1) * sizeof(T),
|
||||
stream, gpu_index);
|
||||
|
||||
radix_body_subtraction_inplace<<<grid, thds, 0, stream>>>(
|
||||
radix_body_subtraction_inplace<T><<<grid, thds, 0, stream>>>(
|
||||
output, plaintext_input, input_lwe_dimension, num_entries);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
@@ -176,7 +177,6 @@ __global__ void unchecked_sub_with_correcting_term(
|
||||
}
|
||||
}
|
||||
template <typename T>
|
||||
|
||||
__host__ void host_unchecked_sub_with_correcting_term(
|
||||
cudaStream_t stream, uint32_t gpu_index, T *output, T *input_1, T *input_2,
|
||||
uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count,
|
||||
@@ -193,7 +193,7 @@ __host__ void host_unchecked_sub_with_correcting_term(
|
||||
dim3 grid(num_blocks, 1, 1);
|
||||
dim3 thds(num_threads, 1, 1);
|
||||
|
||||
unchecked_sub_with_correcting_term<<<grid, thds, 0, stream>>>(
|
||||
unchecked_sub_with_correcting_term<T><<<grid, thds, 0, stream>>>(
|
||||
output, input_1, input_2, num_entries, lwe_size, message_modulus,
|
||||
carry_modulus, degree);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
@@ -9,12 +9,12 @@ void cuda_mult_lwe_ciphertext_vector_cleartext_vector_32(
|
||||
void *cleartext_array_in, uint32_t input_lwe_dimension,
|
||||
uint32_t input_lwe_ciphertext_count) {
|
||||
|
||||
host_cleartext_multiplication(static_cast<cudaStream_t>(stream), gpu_index,
|
||||
static_cast<uint32_t *>(lwe_array_out),
|
||||
static_cast<uint32_t *>(lwe_array_in),
|
||||
static_cast<uint32_t *>(cleartext_array_in),
|
||||
input_lwe_dimension,
|
||||
input_lwe_ciphertext_count);
|
||||
host_cleartext_vec_multiplication<uint32_t>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index,
|
||||
static_cast<uint32_t *>(lwe_array_out),
|
||||
static_cast<uint32_t *>(lwe_array_in),
|
||||
static_cast<uint32_t *>(cleartext_array_in), input_lwe_dimension,
|
||||
input_lwe_ciphertext_count);
|
||||
}
|
||||
/*
|
||||
* Perform the multiplication of a u64 input LWE ciphertext vector with a u64
|
||||
@@ -49,10 +49,10 @@ void cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
|
||||
void *cleartext_array_in, uint32_t input_lwe_dimension,
|
||||
uint32_t input_lwe_ciphertext_count) {
|
||||
|
||||
host_cleartext_multiplication(static_cast<cudaStream_t>(stream), gpu_index,
|
||||
static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<uint64_t *>(lwe_array_in),
|
||||
static_cast<uint64_t *>(cleartext_array_in),
|
||||
input_lwe_dimension,
|
||||
input_lwe_ciphertext_count);
|
||||
host_cleartext_vec_multiplication<uint64_t>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index,
|
||||
static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<uint64_t *>(lwe_array_in),
|
||||
static_cast<uint64_t *>(cleartext_array_in), input_lwe_dimension,
|
||||
input_lwe_ciphertext_count);
|
||||
}
|
||||
|
||||
@@ -14,9 +14,10 @@
|
||||
#include <vector>
|
||||
|
||||
template <typename T>
|
||||
__global__ void
|
||||
cleartext_multiplication(T *output, T *lwe_input, T *cleartext_input,
|
||||
uint32_t input_lwe_dimension, uint32_t num_entries) {
|
||||
__global__ void cleartext_vec_multiplication(T *output, T *lwe_input,
|
||||
T *cleartext_input,
|
||||
uint32_t input_lwe_dimension,
|
||||
uint32_t num_entries) {
|
||||
|
||||
int tid = threadIdx.x;
|
||||
int index = blockIdx.x * blockDim.x + tid;
|
||||
@@ -27,10 +28,46 @@ cleartext_multiplication(T *output, T *lwe_input, T *cleartext_input,
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__host__ void
|
||||
host_cleartext_vec_multiplication(cudaStream_t stream, uint32_t gpu_index,
|
||||
T *output, T *lwe_input, T *cleartext_input,
|
||||
uint32_t input_lwe_dimension,
|
||||
uint32_t input_lwe_ciphertext_count) {
|
||||
|
||||
cudaSetDevice(gpu_index);
|
||||
// lwe_size includes the presence of the body
|
||||
// whereas lwe_dimension is the number of elements in the mask
|
||||
int lwe_size = input_lwe_dimension + 1;
|
||||
// Create a 1-dimensional grid of threads
|
||||
int num_blocks = 0, num_threads = 0;
|
||||
int num_entries = input_lwe_ciphertext_count * lwe_size;
|
||||
getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
|
||||
dim3 grid(num_blocks, 1, 1);
|
||||
dim3 thds(num_threads, 1, 1);
|
||||
|
||||
cleartext_vec_multiplication<T><<<grid, thds, 0, stream>>>(
|
||||
output, lwe_input, cleartext_input, input_lwe_dimension, num_entries);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__global__ void
|
||||
cleartext_multiplication(T *output, T *lwe_input, T cleartext_input,
|
||||
uint32_t input_lwe_dimension, uint32_t num_entries) {
|
||||
|
||||
int tid = threadIdx.x;
|
||||
int index = blockIdx.x * blockDim.x + tid;
|
||||
if (index < num_entries) {
|
||||
// Here we take advantage of the wrapping behaviour of uint
|
||||
output[index] = lwe_input[index] * cleartext_input;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__host__ void
|
||||
host_cleartext_multiplication(cudaStream_t stream, uint32_t gpu_index,
|
||||
T *output, T *lwe_input, T *cleartext_input,
|
||||
T *output, T *lwe_input, T cleartext_input,
|
||||
uint32_t input_lwe_dimension,
|
||||
uint32_t input_lwe_ciphertext_count) {
|
||||
|
||||
@@ -45,7 +82,7 @@ host_cleartext_multiplication(cudaStream_t stream, uint32_t gpu_index,
|
||||
dim3 grid(num_blocks, 1, 1);
|
||||
dim3 thds(num_threads, 1, 1);
|
||||
|
||||
cleartext_multiplication<<<grid, thds, 0, stream>>>(
|
||||
cleartext_multiplication<T><<<grid, thds, 0, stream>>>(
|
||||
output, lwe_input, cleartext_input, input_lwe_dimension, num_entries);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
@@ -10,10 +10,10 @@ void cuda_negate_lwe_ciphertext_vector_32(void *stream, uint32_t gpu_index,
|
||||
uint32_t input_lwe_dimension,
|
||||
uint32_t input_lwe_ciphertext_count) {
|
||||
|
||||
host_negation(static_cast<cudaStream_t>(stream), gpu_index,
|
||||
static_cast<uint32_t *>(lwe_array_out),
|
||||
static_cast<uint32_t *>(lwe_array_in), input_lwe_dimension,
|
||||
input_lwe_ciphertext_count);
|
||||
host_negation<uint32_t>(static_cast<cudaStream_t>(stream), gpu_index,
|
||||
static_cast<uint32_t *>(lwe_array_out),
|
||||
static_cast<uint32_t *>(lwe_array_in),
|
||||
input_lwe_dimension, input_lwe_ciphertext_count);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -44,8 +44,8 @@ void cuda_negate_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index,
|
||||
uint32_t input_lwe_dimension,
|
||||
uint32_t input_lwe_ciphertext_count) {
|
||||
|
||||
host_negation(static_cast<cudaStream_t>(stream), gpu_index,
|
||||
static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<uint64_t *>(lwe_array_in), input_lwe_dimension,
|
||||
input_lwe_ciphertext_count);
|
||||
host_negation<uint64_t>(static_cast<cudaStream_t>(stream), gpu_index,
|
||||
static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<uint64_t *>(lwe_array_in),
|
||||
input_lwe_dimension, input_lwe_ciphertext_count);
|
||||
}
|
||||
|
||||
@@ -37,7 +37,7 @@ __host__ void host_negation(cudaStream_t stream, uint32_t gpu_index, T *output,
|
||||
dim3 grid(num_blocks, 1, 1);
|
||||
dim3 thds(num_threads, 1, 1);
|
||||
|
||||
negation<<<grid, thds, 0, stream>>>(output, input, num_entries);
|
||||
negation<T><<<grid, thds, 0, stream>>>(output, input, num_entries);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user